From a20c9b4d4129e782129e3c97c1e6d88c864fade7 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Tue, 24 Sep 2024 15:01:58 +0800 Subject: [PATCH 01/31] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E5=90=8C=E6=97=B6=E4=B8=8A=E6=8A=A5=E6=95=85?= =?UTF-8?q?=E9=9A=9C=E8=AF=A6=E7=BB=86=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/device.go | 54 ++++++++++++++--- pkg/common/device_test.go | 6 +- pkg/common/proto.go | 26 ++++++++ pkg/device/deviceswitch/ascend_switch.go | 77 +++++++++--------------- 4 files changed, 104 insertions(+), 59 deletions(-) diff --git a/pkg/common/device.go b/pkg/common/device.go index a01c4a1e..db660535 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -16,6 +16,7 @@ package common import ( + "encoding/json" "fmt" "strconv" "strings" @@ -51,7 +52,7 @@ var ( // SwitchFaultLock is used for CurrentSwitchFault which may be used concurrence SwitchFaultLock sync.Mutex // CurrentSwitchFault store all switch fault which will be reported to device-info configmap - currentSwitchFault = make([]int64, 0, GeneralMapSize) + currentSwitchFault = make([]SwitchFaultEvent, 0, GeneralMapSize) ) // GetDeviceID get device physical id and virtual by device name @@ -98,7 +99,7 @@ func GetSwitchFaultInfo() SwitchFaultInfo { defer SwitchFaultLock.Unlock() SwitchFaultLevelMapLock.Lock() for _, code := range currentSwitchFault { - level := SwitchFaultLevelMap[code] + level := SwitchFaultLevelMap[int64(code.FaultType)] if level > maxFaultLevel { maxFaultLevel = level } @@ -116,13 +117,17 @@ func GetSwitchFaultInfo() SwitchFaultInfo { faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" } // keep those none zero codes + reportFaultCodes := make([]string, 0) - for _, code := range currentSwitchFault { - if code != 0 { - reportFaultCodes = append(reportFaultCodes, fmt.Sprintf("%08x", code)) + for _, faultInfo := range currentSwitchFault { + if 
faultInfo.FaultType != 0 { + faultStr, err := getSimpleSwitchFaultStr(faultInfo) + if err != nil { + continue + } + reportFaultCodes = append(reportFaultCodes, faultStr) } } - return SwitchFaultInfo{ FaultCode: reportFaultCodes, FaultLevel: faultLevel, @@ -131,15 +136,48 @@ func GetSwitchFaultInfo() SwitchFaultInfo { } } +// getSimpleSwitchFaultStr convert fault info to string to display in switch fault info configmap +func getSimpleSwitchFaultStr(faultInfo SwitchFaultEvent) (string, error) { + // todo 确定哪些字段是可以展示的 + type simpleSwitchFaultInfo struct { + EventType uint + FaultType string + PeerPortDevice uint + PeerPortId uint + SwitchChipId uint + SwitchPortId uint + Severity uint + Assertion uint + AlarmRaisedTime int64 + } + + bytes, err := json.Marshal(simpleSwitchFaultInfo{ + EventType: faultInfo.EventType, + FaultType: fmt.Sprintf("%08x", faultInfo.FaultType), + PeerPortDevice: faultInfo.PeerPortDevice, + PeerPortId: faultInfo.PeerPortId, + SwitchChipId: faultInfo.SwitchChipId, + SwitchPortId: faultInfo.SwitchPortId, + Severity: faultInfo.Severity, + Assertion: faultInfo.Assertion, + AlarmRaisedTime: faultInfo.AlarmRaisedTime, + }) + if err != nil { + hwlog.RunLog.Warnf("failed to convert fault:%#v, err: %v", faultInfo, err) + return "", err + } + return string(bytes), nil +} + // SetSwitchFaultCode set switch fault code -func SetSwitchFaultCode(newFaults []int64) { +func SetSwitchFaultCode(newFaults []SwitchFaultEvent) { SwitchFaultLock.Lock() defer SwitchFaultLock.Unlock() currentSwitchFault = newFaults } // GetSwitchFaultCode get switch fault code -func GetSwitchFaultCode() []int64 { +func GetSwitchFaultCode() []SwitchFaultEvent { return currentSwitchFault } diff --git a/pkg/common/device_test.go b/pkg/common/device_test.go index 5aac88b6..ed5811c2 100644 --- a/pkg/common/device_test.go +++ b/pkg/common/device_test.go @@ -206,15 +206,15 @@ func TestGetSwitchFaultInfo(t *testing.T) { convey.Convey("test GetSwitchFaultInfo", t, func() { 
ParamOption.RealCardType = common.Ascend910A3 ParamOption.EnableSwitchFault = true - currentSwitchFault = []int64{} + currentSwitchFault = []SwitchFaultEvent{} SwitchFaultLevelMap = map[int64]int{} convey.Convey("test empty SwitchFaultLevelMap", func() { - currentSwitchFault = append(currentSwitchFault, generalFaultCode) + currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{FaultType: generalFaultCode}) fault := GetSwitchFaultInfo() convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) }) convey.Convey("test actually level", func() { - currentSwitchFault = append(currentSwitchFault, generalFaultCode) + currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{FaultType: generalFaultCode}) SwitchFaultLevelMap = map[int64]int{generalFaultCode: NotHandleFaultLevel} fault := GetSwitchFaultInfo() convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) diff --git a/pkg/common/proto.go b/pkg/common/proto.go index c30d56f1..181ad86b 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -38,6 +38,32 @@ type NodeDeviceInfoCache struct { CheckCode string } +// SwitchFaultEvent is the struct for switch reported fault +type SwitchFaultEvent struct { + EventType uint + // SubType fault subtype used for id a fault + SubType uint + // FaultType To maintain compatibility with next incoming switch fault codes changes + // this filed is reserved for next version + // in this version this field is obtained through manually mapping in device-plugin + // next version this filed will be reported by switch driver + FaultType uint + // PeerPortDevice used to tell what kind of device connected to + PeerPortDevice uint + PeerPortId uint + SwitchChipId uint + SwitchPortId uint + // Severity used to tell how serious is the fault + Severity uint + // Assertion tell what kind of fault, recover, happen or once + Assertion uint + EventSerialNum int + NotifySerialNum int + AlarmRaisedTime int64 + AdditionalParam string + 
AdditionalInfo string +} + // SwitchFaultInfo Switch Fault Info type SwitchFaultInfo struct { FaultCode []string diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 7763fa81..6c2e73c4 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -100,27 +100,6 @@ const ( subTypeBase = 1000 ) -// SwitchFaultEvent is the struct for switch reported fault -type SwitchFaultEvent struct { - EventType uint - // SubType fault subtype used for id a fault - SubType uint - // PeerPortDevice used to tell what kind of device connected to - PeerPortDevice uint - PeerPortId uint - SwitchChipId uint - SwitchPortId uint - // Severity used to tell how serious is the fault - Severity uint - // Assertion tell what kind of fault, recover, happen or once - Assertion uint - EventSerialNum int - NotifySerialNum int - AlarmRaisedTime int64 - AdditionalParam string - AdditionalInfo string -} - // SwitchDevManager is the manager for switch type SwitchDevManager struct { } @@ -129,7 +108,9 @@ var ( switchInitOnce sync.Once // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, // this kind of fault code will be faultcode * 1000 + PeerPortDevice - duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} + duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} + EventTypeToFaultTypeMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, + 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} ) // UpdateSwitchFaultLevel update the map recording fault code and it's level, as long as deviceinfo changed @@ -188,20 +169,20 @@ func goFaultEventHandler(event *C.struct_LqDcmiEvent) { // faultEventHandler callback function for subscribe mod, witch will receive fault code when fault happens faultEvent := convertFaultEvent(event) 
hwlog.RunLog.Warnf("switch subscribe got fault:%#v, hex:%v", faultEvent, - fmt.Sprintf("%08x", faultEvent.SubType)) + fmt.Sprintf("%08x", faultEvent.FaultType)) // for recovered fault, delete them from current fault codes if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { - newFaultCodes := make([]int64, 0) - for _, code := range common.GetSwitchFaultCode() { - if code != int64(faultEvent.SubType) { - newFaultCodes = append(newFaultCodes, code) + newFaultCodes := make([]common.SwitchFaultEvent, 0) + for _, errInfo := range common.GetSwitchFaultCode() { + if int64(errInfo.FaultType) != int64(faultEvent.FaultType) { + newFaultCodes = append(newFaultCodes, errInfo) } } common.SetSwitchFaultCode(newFaultCodes) return } currentFault := common.GetSwitchFaultCode() - common.SetSwitchFaultCode(append(currentFault, int64(faultEvent.SubType))) + common.SetSwitchFaultCode(append(currentFault, faultEvent)) } // GetSwitchFaultCodeByInterval start a none stop loop to query and update switch fault code @@ -244,37 +225,42 @@ func (sdm *SwitchDevManager) SubscribeSwitchFaults() error { } // GetSwitchFaults will try to get all fault -func GetSwitchFaults() ([]int64, error) { +func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { var errCount C.uint var errInfoArray [maxFaultNum]C.struct_LqDcmiEvent if retCode := C.lq_dcmi_get_fault_info(C.uint(maxFaultNum), &errCount, &errInfoArray[0]); int32(retCode) != devmanagercommon.Success { - return []int64{}, fmt.Errorf("failed to get switch device errorcodes, errCode:%v", retCode) + return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errorcodes, errCode:%v", retCode) } if int32(errCount) < 0 || int32(errCount) > maxFaultNum { - return []int64{}, fmt.Errorf("failed to get switch device errcodes, cause errcodes nums %d is illegal", errCount) + return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errcodes, "+ + "cause errcodes nums %v is illegal", errCount) } - retErrores := 
make([]int64, 0) + errorCodes := make([]int64, 0) + retErrorInfo := make([]common.SwitchFaultEvent, 0) for i := 0; i < len(errInfoArray); i++ { faultEvent := convertFaultEvent(&errInfoArray[i]) - if faultEvent.SubType == 0 { + if faultEvent.FaultType == 0 { continue } if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { continue } - retErrores = append(retErrores, int64(faultEvent.SubType)) + errorCodes = append(errorCodes, int64(faultEvent.FaultType)) + retErrorInfo = append(retErrorInfo, faultEvent) } - hwlog.RunLog.Warnf("get fault:%#v", retErrores) - return retErrores, nil + hwlog.RunLog.Warnf("get fault codes: %#v", errorCodes) + return retErrorInfo, nil } // convertFaultEvent convert event getting from driver to go struct -func convertFaultEvent(event *C.struct_LqDcmiEvent) SwitchFaultEvent { - fault := SwitchFaultEvent{ - EventType: uint(event.eventType), - SubType: uint(event.subType), +func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { + fault := common.SwitchFaultEvent{ + EventType: uint(event.eventType), + SubType: uint(event.subType), + // in this version, this filed is always 0 because C struct doesn't have such filed + FaultType: uint(0), PeerPortDevice: uint(event.peerportDevice), PeerPortId: uint(event.peerportId), SwitchChipId: uint(event.switchChipid), @@ -285,13 +271,8 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) SwitchFaultEvent { NotifySerialNum: int(event.notifySerialNum), AlarmRaisedTime: int64(event.alarmRaisedTime), } - fault.SubType = getEventType(fault) - return fault -} - -func getEventType(event SwitchFaultEvent) uint { - if duplicate, ok := duplicateSubEventMap[int(event.SubType)]; ok && duplicate { - return event.SubType*subTypeBase + event.PeerPortDevice + if event.FaultType == 0 { + event.FaultType = EventTypeToFaultTypeMapper[event.EventType] } - return event.SubType + return fault } -- Gitee From e23e6c4590e2c429a580c8e87a1c4b168108b827 Mon Sep 17 00:00:00 2001 From: lianjun-zhang 
Date: Tue, 24 Sep 2024 15:07:05 +0800 Subject: [PATCH 02/31] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E5=90=8C=E6=97=B6=E4=B8=8A=E6=8A=A5=E6=95=85?= =?UTF-8?q?=E9=9A=9C=E8=AF=A6=E7=BB=86=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 6c2e73c4..1565ca6f 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -260,7 +260,7 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { EventType: uint(event.eventType), SubType: uint(event.subType), // in this version, this filed is always 0 because C struct doesn't have such filed - FaultType: uint(0), + FaultType: EventTypeToFaultTypeMapper[event.EventType], PeerPortDevice: uint(event.peerportDevice), PeerPortId: uint(event.peerportId), SwitchChipId: uint(event.switchChipid), @@ -271,8 +271,5 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { NotifySerialNum: int(event.notifySerialNum), AlarmRaisedTime: int64(event.alarmRaisedTime), } - if event.FaultType == 0 { - event.FaultType = EventTypeToFaultTypeMapper[event.EventType] - } return fault } -- Gitee From 7f048aaeca77edce70713d1f4795167129a855f8 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Tue, 24 Sep 2024 15:09:07 +0800 Subject: [PATCH 03/31] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E5=90=8C=E6=97=B6=E4=B8=8A=E6=8A=A5=E6=95=85?= =?UTF-8?q?=E9=9A=9C=E8=AF=A6=E7=BB=86=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go 
b/pkg/device/deviceswitch/ascend_switch.go index 1565ca6f..fe715038 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -260,7 +260,7 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { EventType: uint(event.eventType), SubType: uint(event.subType), // in this version, this filed is always 0 because C struct doesn't have such filed - FaultType: EventTypeToFaultTypeMapper[event.EventType], + FaultType: EventTypeToFaultTypeMapper[event.eventType], PeerPortDevice: uint(event.peerportDevice), PeerPortId: uint(event.peerportId), SwitchChipId: uint(event.switchChipid), -- Gitee From 3a4e2ecdbc38fee1d880e7e5f615399afc981b5d Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Tue, 24 Sep 2024 15:12:16 +0800 Subject: [PATCH 04/31] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E5=90=8C=E6=97=B6=E4=B8=8A=E6=8A=A5=E6=95=85?= =?UTF-8?q?=E9=9A=9C=E8=AF=A6=E7=BB=86=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index fe715038..c559c885 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -260,7 +260,7 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { EventType: uint(event.eventType), SubType: uint(event.subType), // in this version, this filed is always 0 because C struct doesn't have such filed - FaultType: EventTypeToFaultTypeMapper[event.eventType], + FaultType: EventTypeToFaultTypeMapper[uint(event.eventType)], PeerPortDevice: uint(event.peerportDevice), PeerPortId: uint(event.peerportId), SwitchChipId: uint(event.switchChipid), -- Gitee From 4ec88db3ee74246cfe51729f13a965e622486d16 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: 
Thu, 26 Sep 2024 10:48:56 +0800 Subject: [PATCH 05/31] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E5=90=8C=E6=97=B6=E4=B8=8A=E6=8A=A5=E6=95=85?= =?UTF-8?q?=E9=9A=9C=E8=AF=A6=E7=BB=86=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/device.go | 49 ++++++++++++++---------- pkg/common/fault_code.go | 9 +++-- pkg/device/deviceswitch/ascend_switch.go | 2 +- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/pkg/common/device.go b/pkg/common/device.go index db660535..0511756c 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -47,7 +47,7 @@ const ( var ( // SwitchFaultLevelMapLock Lock SwitchFaultLevelMap to avoid concurrence write and read SwitchFaultLevelMapLock sync.Mutex - // SwitchFaultLevelMap record every fault code and it's level + // SwitchFaultLevelMap record every switch fault code and it's level SwitchFaultLevelMap = make(map[int64]int, GeneralMapSize) // SwitchFaultLock is used for CurrentSwitchFault which may be used concurrence SwitchFaultLock sync.Mutex @@ -94,32 +94,16 @@ func GetSwitchFaultInfo() SwitchFaultInfo { if ParamOption.RealCardType != common.Ascend910A3 || !ParamOption.EnableSwitchFault { return SwitchFaultInfo{} } - maxFaultLevel := 0 + SwitchFaultLock.Lock() defer SwitchFaultLock.Unlock() SwitchFaultLevelMapLock.Lock() - for _, code := range currentSwitchFault { - level := SwitchFaultLevelMap[int64(code.FaultType)] - if level > maxFaultLevel { - maxFaultLevel = level - } - } - SwitchFaultLevelMapLock.Unlock() - faultLevel, NodeStatus := NotHandleFaultLevelStr, "Healthy" - switch maxFaultLevel { - case NotHandleFaultLevel: - faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" - case PreSeparateFaultLevel: - faultLevel, NodeStatus = PreSeparateFaultLevelStr, "SubHealthy" - case SeparateFaultLevel: - faultLevel, NodeStatus = SeparateFaultLevelStr, "UnHealthy" - default: - faultLevel, NodeStatus = 
NotHandleFaultLevelStr, "Healthy" - } - // keep those none zero codes + + faultLevel, NodeStatus := getSwitchFaultLevelAndNodeStatus() reportFaultCodes := make([]string, 0) for _, faultInfo := range currentSwitchFault { + // keep those none zero codes if faultInfo.FaultType != 0 { faultStr, err := getSimpleSwitchFaultStr(faultInfo) if err != nil { @@ -136,6 +120,29 @@ func GetSwitchFaultInfo() SwitchFaultInfo { } } +func getSwitchFaultLevelAndNodeStatus() (faultLevel string, NodeStatus string) { + maxFaultLevel := 0 + for _, code := range currentSwitchFault { + level := SwitchFaultLevelMap[int64(code.FaultType)] + if level > maxFaultLevel { + maxFaultLevel = level + } + } + SwitchFaultLevelMapLock.Unlock() + faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" + switch maxFaultLevel { + case NotHandleFaultLevel: + faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" + case PreSeparateFaultLevel: + faultLevel, NodeStatus = PreSeparateFaultLevelStr, "SubHealthy" + case SeparateFaultLevel: + faultLevel, NodeStatus = SeparateFaultLevelStr, "UnHealthy" + default: + faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" + } + return faultLevel, NodeStatus +} + // getSimpleSwitchFaultStr convert fault info to string to display in switch fault info configmap func getSimpleSwitchFaultStr(faultInfo SwitchFaultEvent) (string, error) { // todo 确定哪些字段是可以展示的 diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index de6a4443..27c2093b 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -496,11 +496,12 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) SeparateFaultCodes = make([]int64, 0, GeneralMapSize) - + invalidFormatInfo := "failed to parse NotHandleFaultCodes faultcode:%v, will ignore it," + + " please check if it is a hex value" for _, code := range switchFileInfo.NotHandleFaultCodes { codeInt64, err 
:= strconv.ParseInt(code, Hex, BitSize) if err != nil { - hwlog.RunLog.Warnf("failed to parse NotHandleFaultCodes faultcode:%v", code) + hwlog.RunLog.Warnf(invalidFormatInfo, code) continue } NotHandleFaultCodes = append(NotHandleFaultCodes, codeInt64) @@ -510,7 +511,7 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { for _, code := range switchFileInfo.ReportFaultCodes { codeInt64, err := strconv.ParseInt(code, Hex, BitSize) if err != nil { - hwlog.RunLog.Warnf("failed to parse PreSeparateFaultCodes:%v", code) + hwlog.RunLog.Warnf(invalidFormatInfo, code) continue } PreSeparateFaultCodes = append(PreSeparateFaultCodes, codeInt64) @@ -520,7 +521,7 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { for _, code := range switchFileInfo.SeparateFaultCodes { codeInt64, err := strconv.ParseInt(code, Hex, BitSize) if err != nil { - hwlog.RunLog.Warnf("failed to parse SeparateFaultCodes:%v", code) + hwlog.RunLog.Warnf(invalidFormatInfo, code) continue } SeparateFaultCodes = append(SeparateFaultCodes, codeInt64) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index c559c885..370955be 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -250,7 +250,7 @@ func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { errorCodes = append(errorCodes, int64(faultEvent.FaultType)) retErrorInfo = append(retErrorInfo, faultEvent) } - hwlog.RunLog.Warnf("get fault codes: %#v", errorCodes) + hwlog.RunLog.Warnf("switch of 910A3 get fault codes: %#v", errorCodes) return retErrorInfo, nil } -- Gitee From c7621e477b4161c519a551ae86d42bf65085aab9 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Thu, 26 Sep 2024 12:35:28 +0000 Subject: [PATCH 06/31] =?UTF-8?q?=E5=90=8C=E6=AD=A5=E5=9B=9B=E5=85=83?= =?UTF-8?q?=E7=BB=84=E6=95=85=E9=9A=9C=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lianjun 
Zhang Atlas --- build/SwitchFaultCode.json | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index ed3d1d2c..65522ca1 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -1,19 +1,19 @@ { - "NotHandleFaultCodes":["0000002d"], + "NotHandleFaultCodes":[ + "[,155920,na,na]","[,155921,na,na]","[0x00f103b6,155909,na,na]","[0x00f1ff06,155910,na,na]", + "[0x00f1ff09,155913,na,na]" + ], "ReportFaultCodes": [ - "00000002", "00000003", "0000520a", "00000055", "00000056", "00000057", "00000058", "00000059", "0000005a", - "0000005b", "0000005c", "000001be"], + "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]","[0x00f1ff09,155914,L2,na]" + ], "SubHealthFaultCodes": [ - "00000008", "00002712", "00002afa", "0000000c", "0000000d", "0000000f", "00000010", "00000012", "000055f2", - "000059da", "000061aa", "000000d1", "000000d2", "000000d3", "000000d4", "000000d5", "000000d6", "000000d7", - "000000d8", "000000f3", "000000f4", "000000f5", "000000f6", "00000113", "00000114", "00000115", "00000140", - "000001c0", "000001c1","00001f42"], + "[,155922,L2,na]","[,155923,na,na]","[0x00f103b6,155908,na,na]","[0x00f1ff09,155912,L2,na]" + ], "ResetFaultCodes": [ - "0000001b", "0000001c", "00002710", "00002af8", "00002711", "00002af9", "0000002e", "0000003a", "00000025", - "0000003b", "0000003c", "0000003f", "00000040", "00005208", "00005209", "000055f0", "000059d8", "000055f1", - "000059d9", "000061a8", "000061a9", "000000cf", "000000d0", "000000f9", "000000fa", "0000014a", "00000151", - "00000148", "00000007", "00000004", "00000005", "00000006"], + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f103b0,155907,na,na]", "[0x00f1ff09,155912,cpu,na]", + "[0x00f1ff09,155912,npu,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,npu,na]" + ], "SeparateFaultCodes": [ - "000000de", "000000ff", "00000116", "00000117", "00000149", "0000014b", "0000014c", 
"0000014d", "0000014e", - "0000014f", "00001f40", "00001f41"] + "[,155922,cpu,na]","[,155922,npu,na]","[0x00f103b0,155907,na,na]", "[0x00f1ff09,155912,na,na]" + ] } \ No newline at end of file -- Gitee From f20851a5df4be762aaee969b2ba8b3270869c6e6 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Thu, 26 Sep 2024 12:48:21 +0000 Subject: [PATCH 07/31] =?UTF-8?q?alarmId=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lianjun Zhang Atlas --- pkg/device/deviceswitch/ascend_switch.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 370955be..68936031 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -108,9 +108,13 @@ var ( switchInitOnce sync.Once // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, // this kind of fault code will be faultcode * 1000 + PeerPortDevice - duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} + duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} + // EventTypeToFaultTypeMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 EventTypeToFaultTypeMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} + // EventTypeToAlarmIdMapper todo 14 15 需要确定alarmID,5中需要按checkid单独适配 + EventTypeToAlarmIdMapper = map[uint]string{13: "0x00f103b0", 12: "0x00f103b0", 11: "0x00f103b0", 10: "0x00f1ff06", + 14: "OID(1.3.6.1.4.1.2011.381.4.1.1)", 15: "OID(1.3.6.1.4.1.2011.381.4.1.1)", ) // UpdateSwitchFaultLevel update the map recording fault code and it's level, as long as deviceinfo changed -- Gitee From 0967dceb7a65300736d88af1cd5cb3fc989e8e4a Mon Sep 17 00:00:00 2001 From: lianjun-zhang 
Date: Fri, 27 Sep 2024 13:17:17 +0800 Subject: [PATCH 08/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 4 ++ pkg/common/device.go | 42 ++++++------- pkg/common/device_test.go | 14 ++--- pkg/common/fault_code.go | 45 ++++++++------ pkg/common/proto.go | 5 +- pkg/device/deviceswitch/ascend_switch.go | 75 ++++++++++++++++++++---- 6 files changed, 125 insertions(+), 60 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index c8031061..f478db6d 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -644,6 +644,10 @@ const ( PollFaultCodeCMMinInterval = 30 // GetSwitchFaultCodeInterval is the interval(second) of get all fault code by get interface GetSwitchFaultCodeInterval = 300 + // MinLengthOfFaultCode [0x00f103b0,155904,na,na] must contain at least "[" "]" + MinLengthOfFaultCode = 2 + // PartNumOfFaultCode [0x00f103b0,155904,na,na] must have 4 parts + PartNumOfFaultCode = 4 // FaultCodeCMName is the name of the configmap that is used to save fault code FaultCodeCMName = "mindx-dl-fault-config" // FaultCodeCMNameSpace is the namespace of the fault code configmap diff --git a/pkg/common/device.go b/pkg/common/device.go index 0511756c..4276cc10 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -48,7 +48,7 @@ var ( // SwitchFaultLevelMapLock Lock SwitchFaultLevelMap to avoid concurrence write and read SwitchFaultLevelMapLock sync.Mutex // SwitchFaultLevelMap record every switch fault code and it's level - SwitchFaultLevelMap = make(map[int64]int, GeneralMapSize) + SwitchFaultLevelMap = make(map[string]int, GeneralMapSize) // SwitchFaultLock is used for CurrentSwitchFault which may be used concurrence SwitchFaultLock sync.Mutex // CurrentSwitchFault store all switch fault which will be reported to device-info configmap @@ -104,7 +104,7 
@@ func GetSwitchFaultInfo() SwitchFaultInfo { reportFaultCodes := make([]string, 0) for _, faultInfo := range currentSwitchFault { // keep those none zero codes - if faultInfo.FaultType != 0 { + if faultInfo.EventType != 0 { faultStr, err := getSimpleSwitchFaultStr(faultInfo) if err != nil { continue @@ -123,7 +123,7 @@ func GetSwitchFaultInfo() SwitchFaultInfo { func getSwitchFaultLevelAndNodeStatus() (faultLevel string, NodeStatus string) { maxFaultLevel := 0 for _, code := range currentSwitchFault { - level := SwitchFaultLevelMap[int64(code.FaultType)] + level := SwitchFaultLevelMap[code.AssembledFaultCode] if level > maxFaultLevel { maxFaultLevel = level } @@ -147,27 +147,27 @@ func getSwitchFaultLevelAndNodeStatus() (faultLevel string, NodeStatus string) { func getSimpleSwitchFaultStr(faultInfo SwitchFaultEvent) (string, error) { // todo 确定哪些字段是可以展示的 type simpleSwitchFaultInfo struct { - EventType uint - FaultType string - PeerPortDevice uint - PeerPortId uint - SwitchChipId uint - SwitchPortId uint - Severity uint - Assertion uint - AlarmRaisedTime int64 + EventType uint + AssembledFaultCode string + PeerPortDevice uint + PeerPortId uint + SwitchChipId uint + SwitchPortId uint + Severity uint + Assertion uint + AlarmRaisedTime int64 } bytes, err := json.Marshal(simpleSwitchFaultInfo{ - EventType: faultInfo.EventType, - FaultType: fmt.Sprintf("%08x", faultInfo.FaultType), - PeerPortDevice: faultInfo.PeerPortDevice, - PeerPortId: faultInfo.PeerPortId, - SwitchChipId: faultInfo.SwitchChipId, - SwitchPortId: faultInfo.SwitchPortId, - Severity: faultInfo.Severity, - Assertion: faultInfo.Assertion, - AlarmRaisedTime: faultInfo.AlarmRaisedTime, + EventType: faultInfo.EventType, + AssembledFaultCode: faultInfo.AssembledFaultCode, + PeerPortDevice: faultInfo.PeerPortDevice, + PeerPortId: faultInfo.PeerPortId, + SwitchChipId: faultInfo.SwitchChipId, + SwitchPortId: faultInfo.SwitchPortId, + Severity: faultInfo.Severity, + Assertion: faultInfo.Assertion, + 
AlarmRaisedTime: faultInfo.AlarmRaisedTime, }) if err != nil { hwlog.RunLog.Warnf("failed to convert fault:%#v, err: %v", faultInfo, err) diff --git a/pkg/common/device_test.go b/pkg/common/device_test.go index ed5811c2..68050e94 100644 --- a/pkg/common/device_test.go +++ b/pkg/common/device_test.go @@ -28,7 +28,7 @@ import ( const ( cardNum = 2 - generalFaultCode = 100 + generalFaultCode = "[0x00f103b0,155649,na,NoneExist]" firstFaultIdx = 0 ) @@ -207,23 +207,23 @@ func TestGetSwitchFaultInfo(t *testing.T) { ParamOption.RealCardType = common.Ascend910A3 ParamOption.EnableSwitchFault = true currentSwitchFault = []SwitchFaultEvent{} - SwitchFaultLevelMap = map[int64]int{} + SwitchFaultLevelMap = map[string]int{} convey.Convey("test empty SwitchFaultLevelMap", func() { - currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{FaultType: generalFaultCode}) + currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{AssembledFaultCode: generalFaultCode}) fault := GetSwitchFaultInfo() convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) }) convey.Convey("test actually level", func() { - currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{FaultType: generalFaultCode}) - SwitchFaultLevelMap = map[int64]int{generalFaultCode: NotHandleFaultLevel} + currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{AssembledFaultCode: generalFaultCode}) + SwitchFaultLevelMap = map[string]int{generalFaultCode: NotHandleFaultLevel} fault := GetSwitchFaultInfo() convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) - SwitchFaultLevelMap = map[int64]int{generalFaultCode: PreSeparateFaultLevel} + SwitchFaultLevelMap = map[string]int{generalFaultCode: PreSeparateFaultLevel} fault = GetSwitchFaultInfo() convey.So(fault.FaultLevel == PreSeparateFaultLevelStr, convey.ShouldBeTrue) - SwitchFaultLevelMap = map[int64]int{generalFaultCode: SeparateFaultLevel} + SwitchFaultLevelMap = map[string]int{generalFaultCode: 
SeparateFaultLevel} fault = GetSwitchFaultInfo() convey.So(fault.FaultLevel == SeparateFaultLevelStr, convey.ShouldBeTrue) }) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 27c2093b..06290950 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -86,11 +86,11 @@ const ( var ( faultTypeCode FaultTypeCode // NotHandleFaultCodes contains all fault code that believed to be not handled, in this case is L1 - NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) + NotHandleFaultCodes = make([]string, 0, GeneralMapSize) // PreSeparateFaultCodes contains all fault code that believed to be PreSeparate, in this case is L2-L3 - PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) + PreSeparateFaultCodes = make([]string, 0, GeneralMapSize) // SeparateFaultCodes contains all fault code that believed to be Separate, in this case is L4-L5 - SeparateFaultCodes = make([]int64, 0, GeneralMapSize) + SeparateFaultCodes = make([]string, 0, GeneralMapSize) // initLogicIDs need init fault code device. 
add by train or inference initLogicIDs []int32 // logicIDLock operate initLogicIDs lock @@ -493,43 +493,52 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { return fmt.Errorf("failed to unmarsha switch fault code, err: %s", err.Error()) } - NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) - PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) - SeparateFaultCodes = make([]int64, 0, GeneralMapSize) - invalidFormatInfo := "failed to parse NotHandleFaultCodes faultcode:%v, will ignore it," + - " please check if it is a hex value" + NotHandleFaultCodes = make([]string, 0, GeneralMapSize) + PreSeparateFaultCodes = make([]string, 0, GeneralMapSize) + SeparateFaultCodes = make([]string, 0, GeneralMapSize) + invalidFormatInfo := "failed to parse NotHandleFaultCodes faultCode:%v, will ignore it," + + " please check if its format, such as: [0x00f1ff09,155914,cpu,na]" for _, code := range switchFileInfo.NotHandleFaultCodes { - codeInt64, err := strconv.ParseInt(code, Hex, BitSize) - if err != nil { + if !isValidSwitchFaultCode(code) { hwlog.RunLog.Warnf(invalidFormatInfo, code) continue } - NotHandleFaultCodes = append(NotHandleFaultCodes, codeInt64) + NotHandleFaultCodes = append(NotHandleFaultCodes, code) } switchFileInfo.ReportFaultCodes = append(switchFileInfo.ReportFaultCodes, switchFileInfo.SubHealthFaultCodes...) for _, code := range switchFileInfo.ReportFaultCodes { - codeInt64, err := strconv.ParseInt(code, Hex, BitSize) - if err != nil { + if !isValidSwitchFaultCode(code) { hwlog.RunLog.Warnf(invalidFormatInfo, code) continue } - PreSeparateFaultCodes = append(PreSeparateFaultCodes, codeInt64) + PreSeparateFaultCodes = append(PreSeparateFaultCodes, code) } switchFileInfo.SeparateFaultCodes = append(switchFileInfo.SeparateFaultCodes, switchFileInfo.ResetFaultCodes...) 
for _, code := range switchFileInfo.SeparateFaultCodes { - codeInt64, err := strconv.ParseInt(code, Hex, BitSize) - if err != nil { + if !isValidSwitchFaultCode(code) { hwlog.RunLog.Warnf(invalidFormatInfo, code) continue } - SeparateFaultCodes = append(SeparateFaultCodes, codeInt64) + SeparateFaultCodes = append(SeparateFaultCodes, code) } return nil } +// isValidSwitchFaultCode to judge is a fault code is valid format as [0x00f1ff09,155914,cpu,na] +func isValidSwitchFaultCode(code string) bool { + if len(code) <= MinLengthOfFaultCode { + return false + } + if !strings.HasPrefix(code, "[") || strings.HasSuffix(code, "]") { + return false + } + parts := strings.Split(code, CommaSepDev) + return len(parts) == PartNumOfFaultCode +} + func loadFaultDurationCustomization(customization []FaultDurationCustomization) { handledEventId := make(sets.String, common.MaxErrorCodeCount) for _, cus := range customization { @@ -1270,7 +1279,7 @@ func collectEachFaultEvent(logicId int32, faultInfos []common.DevFaultInfo) { } if faultDurationMap[eventIdStr].Duration == nil { - faultDurationMap[eventIdStr].Duration = make(map[int32]FaultDurationData, 0) + faultDurationMap[eventIdStr].Duration = make(map[int32]FaultDurationData, GeneralMapSize) } if _, ok := faultDurationMap[eventIdStr].Duration[logicId]; !ok { diff --git a/pkg/common/proto.go b/pkg/common/proto.go index 181ad86b..36e59cc7 100644 --- a/pkg/common/proto.go +++ b/pkg/common/proto.go @@ -47,7 +47,10 @@ type SwitchFaultEvent struct { // this filed is reserved for next version // in this version this field is obtained through manually mapping in device-plugin // next version this filed will be reported by switch driver - FaultType uint + FaultID uint + // AssembledFaultCode is to assemble cgo lq.struct_LqDcmiEvent to a device-plugin recognized fault code type + // such as : [0x00f103b0,155904,na,na] in config file: SwitchFaultCode + AssembledFaultCode string // PeerPortDevice used to tell what kind of device connected to 
PeerPortDevice uint PeerPortId uint diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 68936031..7b9cdf11 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -109,19 +109,19 @@ var ( // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, // this kind of fault code will be faultcode * 1000 + PeerPortDevice duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} - // EventTypeToFaultTypeMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 - EventTypeToFaultTypeMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, + // EventTypeToFaultIDMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 + EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} // EventTypeToAlarmIdMapper todo 14 15 需要确定alarmID,5中需要按checkid单独适配 EventTypeToAlarmIdMapper = map[uint]string{13: "0x00f103b0", 12: "0x00f103b0", 11: "0x00f103b0", 10: "0x00f1ff06", - 14: "OID(1.3.6.1.4.1.2011.381.4.1.1)", 15: "OID(1.3.6.1.4.1.2011.381.4.1.1)", + 14: "OID(1.3.6.1.4.1.2011.381.4.1.1)", 15: "OID(1.3.6.1.4.1.2011.381.4.1.1)"} ) // UpdateSwitchFaultLevel update the map recording fault code and it's level, as long as deviceinfo changed func UpdateSwitchFaultLevel() { common.SwitchFaultLevelMapLock.Lock() defer common.SwitchFaultLevelMapLock.Unlock() - common.SwitchFaultLevelMap = make(map[int64]int, common.GeneralMapSize) + common.SwitchFaultLevelMap = make(map[string]int, common.GeneralMapSize) for _, code := range common.NotHandleFaultCodes { common.SwitchFaultLevelMap[code] = common.NotHandleFaultLevel } @@ -173,12 +173,14 @@ func goFaultEventHandler(event *C.struct_LqDcmiEvent) { // faultEventHandler callback function for subscribe mod, witch will receive fault code when fault happens 
faultEvent := convertFaultEvent(event) hwlog.RunLog.Warnf("switch subscribe got fault:%#v, hex:%v", faultEvent, - fmt.Sprintf("%08x", faultEvent.FaultType)) + fmt.Sprintf("%08x", faultEvent.AssembledFaultCode)) // for recovered fault, delete them from current fault codes if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { newFaultCodes := make([]common.SwitchFaultEvent, 0) for _, errInfo := range common.GetSwitchFaultCode() { - if int64(errInfo.FaultType) != int64(faultEvent.FaultType) { + // todo 验证故障恢复场景 + // only in faultevent and recoverEvent major info is the same it will be thought as recover + if !isFaultRecoveredEvent(errInfo, faultEvent) { newFaultCodes = append(newFaultCodes, errInfo) } } @@ -241,17 +243,17 @@ func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { "cause errcodes nums %v is illegal", errCount) } - errorCodes := make([]int64, 0) + errorCodes := make([]string, 0) retErrorInfo := make([]common.SwitchFaultEvent, 0) for i := 0; i < len(errInfoArray); i++ { faultEvent := convertFaultEvent(&errInfoArray[i]) - if faultEvent.FaultType == 0 { + if faultEvent.EventType == 0 { continue } if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { continue } - errorCodes = append(errorCodes, int64(faultEvent.FaultType)) + errorCodes = append(errorCodes, faultEvent.AssembledFaultCode) retErrorInfo = append(retErrorInfo, faultEvent) } hwlog.RunLog.Warnf("switch of 910A3 get fault codes: %#v", errorCodes) @@ -261,10 +263,8 @@ func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { // convertFaultEvent convert event getting from driver to go struct func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { fault := common.SwitchFaultEvent{ - EventType: uint(event.eventType), - SubType: uint(event.subType), - // in this version, this filed is always 0 because C struct doesn't have such filed - FaultType: EventTypeToFaultTypeMapper[uint(event.eventType)], + EventType: uint(event.eventType), + SubType: 
uint(event.subType), PeerPortDevice: uint(event.peerportDevice), PeerPortId: uint(event.peerportId), SwitchChipId: uint(event.switchChipid), @@ -275,5 +275,54 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { NotifySerialNum: int(event.notifySerialNum), AlarmRaisedTime: int64(event.alarmRaisedTime), } + // todo 验证这种写法能够改掉 + if err := setExtraFaultInfo(&fault); err != nil { + hwlog.RunLog.Error(err) + } return fault } + +// getAssembledFaultCode to convert fault event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] +func setExtraFaultInfo(event *common.SwitchFaultEvent) error { + faultID, ok := EventTypeToFaultIDMapper[event.EventType] + if !ok { + hwlog.RunLog.Warnf("failed to find faultID for eventType: %d", event.EventType) + } + alarmID, ok := EventTypeToAlarmIdMapper[event.EventType] + if !ok { + hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) + } + PeerDeviceType := int(event.PeerPortDevice) + PeerDeviceName := "" + switch PeerDeviceType { + // todo 提常量 + case 0: + // todo 当前0值对应CPU和芯片自身故障 需要确认 + PeerDeviceName = "cpu" + case 1: + PeerDeviceName = "npu" + case 2: + PeerDeviceName = "L2" + } + // currently the last part means ports, remain na for now + event.AssembledFaultCode = fmt.Sprintf("[%s,%d,%s,na]", alarmID, faultID, PeerDeviceName) + event.FaultID = faultID + return nil +} + +// isFaultRecoveredEvent to judge if recoverEvent is the recover event for faultEvent 的故障恢复消息 +// todo 确定这种方式是真的恢复事件 与 故障发生事件一致 +func isFaultRecoveredEvent(faultEvent, recoverEvent common.SwitchFaultEvent) bool { + if int8(recoverEvent.Assertion) != devmanagercommon.FaultRecover || recoverEvent.Assertion == faultEvent.Assertion { + return false + } + faultEventInfo := fmt.Sprintf("EventType:%v,SubType:%v,FaultID:%v,AssembledFaultCode:%v,"+ + "PeerPortDevice:%v,PeerPortId:%v,SwitchChipId:%v,SwitchPortId:%v", faultEvent.EventType, faultEvent.SubType, + faultEvent.FaultID, faultEvent.AssembledFaultCode, 
faultEvent.PeerPortDevice, faultEvent.PeerPortId, + faultEvent.SwitchChipId, faultEvent.SwitchPortId) + recoveredEventInfo := fmt.Sprintf("EventType:%v,SubType:%v,FaultID:%v,AssembledFaultCode:%v,"+ + "PeerPortDevice:%v,PeerPortId:%v,SwitchChipId:%v,SwitchPortId:%v", recoverEvent.EventType, recoverEvent.SubType, + recoverEvent.FaultID, recoverEvent.AssembledFaultCode, recoverEvent.PeerPortDevice, recoverEvent.PeerPortId, + recoverEvent.SwitchChipId, recoverEvent.SwitchPortId) + return faultEventInfo == recoveredEventInfo +} -- Gitee From c5e2227620329e910e23aec3e92d7343e03ea70d Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Fri, 27 Sep 2024 16:11:47 +0800 Subject: [PATCH 09/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/fault_code.go | 2 +- pkg/common/fault_code_test.go | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 06290950..27aa591b 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -532,7 +532,7 @@ func isValidSwitchFaultCode(code string) bool { if len(code) <= MinLengthOfFaultCode { return false } - if !strings.HasPrefix(code, "[") || strings.HasSuffix(code, "]") { + if !strings.HasPrefix(code, "[") || !strings.HasSuffix(code, "]") { return false } parts := strings.Split(code, CommaSepDev) diff --git a/pkg/common/fault_code_test.go b/pkg/common/fault_code_test.go index 28507d39..d0dfdac5 100644 --- a/pkg/common/fault_code_test.go +++ b/pkg/common/fault_code_test.go @@ -18,7 +18,6 @@ package common import ( "encoding/json" "errors" - "fmt" "strconv" "strings" "testing" @@ -1563,7 +1562,7 @@ func TestGetTimeoutFaultCodes(t *testing.T) { func TestLoadSwitchFaultCode(t *testing.T) { convey.Convey("test LoadSwitchFaultCode", t, func() { switchFileInfo := SwitchFaultFileInfo{ - 
NotHandleFaultCodes: []string{fmt.Sprintf("%08x", generalFaultCode)}, + NotHandleFaultCodes: []string{generalFaultCode}, } bytes, err := json.Marshal(switchFileInfo) convey.So(err, convey.ShouldBeNil) -- Gitee From d434850bc87afeef7f0251bf7b49441ea5bb85e4 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Fri, 27 Sep 2024 18:05:17 +0800 Subject: [PATCH 10/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 49 ++++++++++++++++++++---- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 7b9cdf11..6122257d 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -112,8 +112,13 @@ var ( // EventTypeToFaultIDMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} - // EventTypeToAlarmIdMapper todo 14 15 需要确定alarmID,5中需要按checkid单独适配 - EventTypeToAlarmIdMapper = map[uint]string{13: "0x00f103b0", 12: "0x00f103b0", 11: "0x00f103b0", 10: "0x00f1ff06", + // FaultIdToAlarmIdMapper todo 14 15 5/8 需要确定alarmID,5中需要按checkid单独适配 + FaultIdToAlarmIdMapper = map[uint]string{ + 155907: "0x00f103b0", 155649: "0x00f103b0", 155904: "0x00f103b0", + 132134: "0x00f1ff06", 155910: "0x00f1ff06", 155911: "0x00f1ff06", + 155908: "0x00f103b6", 155909: "0x00f103b6", + 155912: "0x00f1ff09", 155913: "0x00f1ff09", 155914: "0x00f1ff09", + 132332: "0x00f10509", 132333: "0x00f10509", 14: "OID(1.3.6.1.4.1.2011.381.4.1.1)", 15: "OID(1.3.6.1.4.1.2011.381.4.1.1)"} ) @@ -282,13 +287,27 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { return fault } -// getAssembledFaultCode to convert fault 
event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] +// setExtraFaultInfo to convert fault event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] func setExtraFaultInfo(event *common.SwitchFaultEvent) error { - faultID, ok := EventTypeToFaultIDMapper[event.EventType] - if !ok { - hwlog.RunLog.Warnf("failed to find faultID for eventType: %d", event.EventType) + faultID, ok := uint(0), false + // todo eventType为5的情况下,需要依赖checkId进行判断,需要确定后续checkId的稳定性 + if event.EventType == 5 { + if event.SubType == 448 { + faultID = uint(132333) + } else if event.SubType == 449 { + faultID = uint(132332) + } else if event.SubType == 8 { + // todo 8的情况下暂时没有对应的故障id,待确认如何处理 + } + } else { + faultID, ok = EventTypeToFaultIDMapper[event.EventType] + if !ok { + hwlog.RunLog.Warnf("failed to find faultID for eventType: %d", event.EventType) + } } - alarmID, ok := EventTypeToAlarmIdMapper[event.EventType] + + alarmID := "" + alarmID, ok = FaultIdToAlarmIdMapper[faultID] if !ok { hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) } @@ -298,7 +317,11 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { // todo 提常量 case 0: // todo 当前0值对应CPU和芯片自身故障 需要确认 - PeerDeviceName = "cpu" + if isPortLevelFault(int(event.EventType)) { + PeerDeviceName = "cpu" + } else { + PeerDeviceName = "na" + } case 1: PeerDeviceName = "npu" case 2: @@ -310,6 +333,16 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { return nil } +// isPortLevelFault to judge if a fault is related to whole chip or any of its ports +// if it is whole chip peerdeviceType will be "na" while peerdeivce==0 +// else it is for its chip, peerdeviceType will be "cpu" while peerdeivce==0 +func isPortLevelFault(eventType int) bool { + if eventType == 3 || eventType == 4 || eventType == 5 || eventType == 14 || eventType == 15 { + return true + } + return false +} + // isFaultRecoveredEvent to judge if recoverEvent is the recover event for faultEvent 的故障恢复消息 // 
todo 确定这种方式是真的恢复事件 与 故障发生事件一致 func isFaultRecoveredEvent(faultEvent, recoverEvent common.SwitchFaultEvent) bool { -- Gitee From 3c9155396fa4adc2ecfc8d5471810a99657fc14a Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Fri, 27 Sep 2024 18:17:50 +0800 Subject: [PATCH 11/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/device.go | 2 +- pkg/device/deviceswitch/ascend_switch.go | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pkg/common/device.go b/pkg/common/device.go index 4276cc10..53ff46a2 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -102,7 +102,7 @@ func GetSwitchFaultInfo() SwitchFaultInfo { faultLevel, NodeStatus := getSwitchFaultLevelAndNodeStatus() reportFaultCodes := make([]string, 0) - for _, faultInfo := range currentSwitchFault { + for _, faultInfo := range GetSwitchFaultCode() { // keep those none zero codes if faultInfo.EventType != 0 { faultStr, err := getSimpleSwitchFaultStr(faultInfo) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 6122257d..7007f781 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -177,14 +177,13 @@ func (sdm *SwitchDevManager) ShutDownSwitch() { func goFaultEventHandler(event *C.struct_LqDcmiEvent) { // faultEventHandler callback function for subscribe mod, witch will receive fault code when fault happens faultEvent := convertFaultEvent(event) - hwlog.RunLog.Warnf("switch subscribe got fault:%#v, hex:%v", faultEvent, - fmt.Sprintf("%08x", faultEvent.AssembledFaultCode)) + hwlog.RunLog.Warnf("switch subscribe got fault:%#v, faultCode:%v", faultEvent, faultEvent.AssembledFaultCode) // for recovered fault, delete them from current fault codes if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { 
newFaultCodes := make([]common.SwitchFaultEvent, 0) for _, errInfo := range common.GetSwitchFaultCode() { // todo 验证故障恢复场景 - // only in faultevent and recoverEvent major info is the same it will be thought as recover + // only in faultEvent and recoverEvent major info is the same it will be thought as recover if !isFaultRecoveredEvent(errInfo, faultEvent) { newFaultCodes = append(newFaultCodes, errInfo) } @@ -215,9 +214,7 @@ func (sdm *SwitchDevManager) GetSwitchFaultCodeByInterval(ctx context.Context, i time.Sleep(interval) continue } - common.SetSwitchFaultCode(errCodes) - time.Sleep(interval) } } @@ -241,7 +238,8 @@ func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { var errInfoArray [maxFaultNum]C.struct_LqDcmiEvent if retCode := C.lq_dcmi_get_fault_info(C.uint(maxFaultNum), &errCount, &errInfoArray[0]); int32(retCode) != devmanagercommon.Success { - return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errorcodes, errCode:%v", retCode) + return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errorcodes, "+ + "errCode:%v", retCode) } if int32(errCount) < 0 || int32(errCount) > maxFaultNum { return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errcodes, "+ @@ -292,11 +290,12 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { faultID, ok := uint(0), false // todo eventType为5的情况下,需要依赖checkId进行判断,需要确定后续checkId的稳定性 if event.EventType == 5 { - if event.SubType == 448 { + switch event.SubType { + case 448: faultID = uint(132333) - } else if event.SubType == 449 { + case 449: faultID = uint(132332) - } else if event.SubType == 8 { + case 8: // todo 8的情况下暂时没有对应的故障id,待确认如何处理 } } else { -- Gitee From 63d451907e77ac8d04c967f3dbef9547f590de9a Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sat, 28 Sep 2024 11:48:17 +0800 Subject: [PATCH 12/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/constants.go | 7 +++++++ pkg/device/deviceswitch/ascend_switch.go | 15 +++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/pkg/common/constants.go b/pkg/common/constants.go index f478db6d..de950fc4 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -725,6 +725,13 @@ const ( FaultSeverityCritical ) +// peer device type of switch +const ( + PeerDeviceChipOrCpuPort = 0 + PeerDeviceNpuPort = 1 + PeerDeviceL2Port = 2 +) + // LogicID list for reset, get id list of ring const ( ManuallySeparateNpuFirstHandle = "FirstHandle" diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 7007f781..052f1843 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -109,10 +109,11 @@ var ( // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, // this kind of fault code will be faultcode * 1000 + PeerPortDevice duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} - // EventTypeToFaultIDMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 + // EventTypeToFaultIDMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 需对齐 EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} - // FaultIdToAlarmIdMapper todo 14 15 5/8 需要确定alarmID,5中需要按checkid单独适配 + // FaultIdToAlarmIdMapper todo 14 15 5/8 需要确定alarmID, 5中 448 449已按需特殊适配,8仍需要按checkid单独适配 + // todo 5/8 14 15均需要给出独立的alarmID FaultIdToAlarmIdMapper = map[uint]string{ 155907: "0x00f103b0", 155649: "0x00f103b0", 155904: "0x00f103b0", 132134: "0x00f1ff06", 155910: "0x00f1ff06", 155911: "0x00f1ff06", @@ -296,7 +297,7 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { case 449: faultID = uint(132332) case 8: - // todo 8的情况下暂时没有对应的故障id,待确认如何处理 + 
// todo 8的情况下暂时没有对应的故障id,待确认如何处理 给一个独立的alarmID } } else { faultID, ok = EventTypeToFaultIDMapper[event.EventType] @@ -313,17 +314,15 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { PeerDeviceType := int(event.PeerPortDevice) PeerDeviceName := "" switch PeerDeviceType { - // todo 提常量 - case 0: - // todo 当前0值对应CPU和芯片自身故障 需要确认 + case common.PeerDeviceChipOrCpuPort: if isPortLevelFault(int(event.EventType)) { PeerDeviceName = "cpu" } else { PeerDeviceName = "na" } - case 1: + case common.PeerDeviceNpuPort: PeerDeviceName = "npu" - case 2: + case common.PeerDeviceL2Port: PeerDeviceName = "L2" } // currently the last part means ports, remain na for now -- Gitee From 52a148f0532db53078e55a28510bcdf1edf65edc Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sun, 29 Sep 2024 15:31:37 +0800 Subject: [PATCH 13/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/common/device.go | 11 ++++------- pkg/device/deviceswitch/ascend_switch.go | 7 +------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/pkg/common/device.go b/pkg/common/device.go index 53ff46a2..01b064dd 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -103,14 +103,11 @@ func GetSwitchFaultInfo() SwitchFaultInfo { reportFaultCodes := make([]string, 0) for _, faultInfo := range GetSwitchFaultCode() { - // keep those none zero codes - if faultInfo.EventType != 0 { - faultStr, err := getSimpleSwitchFaultStr(faultInfo) - if err != nil { - continue - } - reportFaultCodes = append(reportFaultCodes, faultStr) + faultStr, err := getSimpleSwitchFaultStr(faultInfo) + if err != nil { + continue } + reportFaultCodes = append(reportFaultCodes, faultStr) } return SwitchFaultInfo{ FaultCode: reportFaultCodes, diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 052f1843..045ba4be 
100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -183,7 +183,6 @@ func goFaultEventHandler(event *C.struct_LqDcmiEvent) { if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { newFaultCodes := make([]common.SwitchFaultEvent, 0) for _, errInfo := range common.GetSwitchFaultCode() { - // todo 验证故障恢复场景 // only in faultEvent and recoverEvent major info is the same it will be thought as recover if !isFaultRecoveredEvent(errInfo, faultEvent) { newFaultCodes = append(newFaultCodes, errInfo) @@ -249,11 +248,8 @@ func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { errorCodes := make([]string, 0) retErrorInfo := make([]common.SwitchFaultEvent, 0) - for i := 0; i < len(errInfoArray); i++ { + for i := 0; i < int(errCount); i++ { faultEvent := convertFaultEvent(&errInfoArray[i]) - if faultEvent.EventType == 0 { - continue - } if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { continue } @@ -279,7 +275,6 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { NotifySerialNum: int(event.notifySerialNum), AlarmRaisedTime: int64(event.alarmRaisedTime), } - // todo 验证这种写法能够改掉 if err := setExtraFaultInfo(&fault); err != nil { hwlog.RunLog.Error(err) } -- Gitee From 26d6e27a39c861d746e5cec52eec511920f1a624 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sun, 29 Sep 2024 17:59:58 +0800 Subject: [PATCH 14/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 045ba4be..f31d21c7 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -308,17 +308,17 @@ func 
setExtraFaultInfo(event *common.SwitchFaultEvent) error { } PeerDeviceType := int(event.PeerPortDevice) PeerDeviceName := "" - switch PeerDeviceType { - case common.PeerDeviceChipOrCpuPort: - if isPortLevelFault(int(event.EventType)) { + if isPortLevelFault(int(event.EventType)) { + switch PeerDeviceType { + case common.PeerDeviceChipOrCpuPort: PeerDeviceName = "cpu" - } else { - PeerDeviceName = "na" + case common.PeerDeviceNpuPort: + PeerDeviceName = "npu" + case common.PeerDeviceL2Port: + PeerDeviceName = "L2" } - case common.PeerDeviceNpuPort: - PeerDeviceName = "npu" - case common.PeerDeviceL2Port: - PeerDeviceName = "L2" + } else { + PeerDeviceName = "na" } // currently the last part means ports, remain na for now event.AssembledFaultCode = fmt.Sprintf("[%s,%d,%s,na]", alarmID, faultID, PeerDeviceName) -- Gitee From eed2f0ef43edf02a093550a9466a234fdf6c63a6 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sun, 29 Sep 2024 19:08:40 +0800 Subject: [PATCH 15/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index f31d21c7..ecec31bb 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -109,18 +109,18 @@ var ( // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, // this kind of fault code will be faultcode * 1000 + PeerPortDevice duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} - // EventTypeToFaultIDMapper todo 14 15 需要单独确认故障码中的告警id和故障id形式 需对齐 + // EventTypeToFaultIDMapper 5/449 5/448 need to calculate in code EventTypeToFaultIDMapper = map[uint]uint{13: 
155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} - // FaultIdToAlarmIdMapper todo 14 15 5/8 需要确定alarmID, 5中 448 449已按需特殊适配,8仍需要按checkid单独适配 - // todo 5/8 14 15均需要给出独立的alarmID + // FaultIdToAlarmIdMapper 5/8 need alarmID + // todo 5/8 均需要给出独立的alarmID FaultIdToAlarmIdMapper = map[uint]string{ 155907: "0x00f103b0", 155649: "0x00f103b0", 155904: "0x00f103b0", 132134: "0x00f1ff06", 155910: "0x00f1ff06", 155911: "0x00f1ff06", 155908: "0x00f103b6", 155909: "0x00f103b6", 155912: "0x00f1ff09", 155913: "0x00f1ff09", 155914: "0x00f1ff09", 132332: "0x00f10509", 132333: "0x00f10509", - 14: "OID(1.3.6.1.4.1.2011.381.4.1.1)", 15: "OID(1.3.6.1.4.1.2011.381.4.1.1)"} + } ) // UpdateSwitchFaultLevel update the map recording fault code and it's level, as long as deviceinfo changed @@ -284,7 +284,7 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { // setExtraFaultInfo to convert fault event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] func setExtraFaultInfo(event *common.SwitchFaultEvent) error { faultID, ok := uint(0), false - // todo eventType为5的情况下,需要依赖checkId进行判断,需要确定后续checkId的稳定性 + // while eventType is 5, it depends on subtype to tell its faultID if event.EventType == 5 { switch event.SubType { case 448: -- Gitee From 9b0898fb4267732be26a0d2ad7578980074095b8 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Sun, 29 Sep 2024 11:41:53 +0000 Subject: [PATCH 16/31] =?UTF-8?q?=E6=95=85=E9=9A=9C=E7=A0=81=E9=80=82?= =?UTF-8?q?=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lianjun Zhang Atlas --- build/SwitchFaultCode.json | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index 65522ca1..a86f0563 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -1,19 +1,20 @@ { 
"NotHandleFaultCodes":[ - "[,155920,na,na]","[,155921,na,na]","[0x00f103b6,155909,na,na]","[0x00f1ff06,155910,na,na]", - "[0x00f1ff09,155913,na,na]" + "[0x00f103b6,155909,na,na]","[0x00f1ff06,155910,na,na]","[0x00f1ff09,155913,na,na]" ], "ReportFaultCodes": [ - "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]","[0x00f1ff09,155914,L2,na]" + "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]" ], "SubHealthFaultCodes": [ - "[,155922,L2,na]","[,155923,na,na]","[0x00f103b6,155908,na,na]","[0x00f1ff09,155912,L2,na]" + "[0x00f103b6,155908,na,na]","[0x00f10509,132332,na,na]","[0x00f10509,132333,na,na]","[0x00f1ff09,155912,L2,na]", + "[0x08520003,,L2,na]" ], "ResetFaultCodes": [ - "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f103b0,155907,na,na]", "[0x00f1ff09,155912,cpu,na]", - "[0x00f1ff09,155912,npu,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,npu,na]" + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,cpu,na]", + "[0x00f1ff09,155912,npu,na]" ], "SeparateFaultCodes": [ - "[,155922,cpu,na]","[,155922,npu,na]","[0x00f103b0,155907,na,na]", "[0x00f1ff09,155912,na,na]" + "[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,na,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,L2,na]", + "[0x00f1ff09,155914,npu,na]","[0x08520003,,cpu,na]","[0x08520003,,npu,na]" ] -} \ No newline at end of file +}` \ No newline at end of file -- Gitee From 1da76bd6a3de8224d17e3d9f1baba5c58574ace1 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sun, 29 Sep 2024 19:43:48 +0800 Subject: [PATCH 17/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index ecec31bb..d7fa334a 
100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -106,14 +106,10 @@ type SwitchDevManager struct { var ( switchInitOnce sync.Once - // fault code with subtype like 8,has 3 different kind, with different connect device:NPU CPU Switch, - // this kind of fault code will be faultcode * 1000 + PeerPortDevice - duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} // EventTypeToFaultIDMapper 5/449 5/448 need to calculate in code EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} // FaultIdToAlarmIdMapper 5/8 need alarmID - // todo 5/8 均需要给出独立的alarmID FaultIdToAlarmIdMapper = map[uint]string{ 155907: "0x00f103b0", 155649: "0x00f103b0", 155904: "0x00f103b0", 132134: "0x00f1ff06", 155910: "0x00f1ff06", 155911: "0x00f1ff06", @@ -291,8 +287,6 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { faultID = uint(132333) case 449: faultID = uint(132332) - case 8: - // todo 8的情况下暂时没有对应的故障id,待确认如何处理 给一个独立的alarmID } } else { faultID, ok = EventTypeToFaultIDMapper[event.EventType] @@ -337,7 +331,6 @@ func isPortLevelFault(eventType int) bool { } // isFaultRecoveredEvent to judge if recoverEvent is the recover event for faultEvent 的故障恢复消息 -// todo 确定这种方式是真的恢复事件 与 故障发生事件一致 func isFaultRecoveredEvent(faultEvent, recoverEvent common.SwitchFaultEvent) bool { if int8(recoverEvent.Assertion) != devmanagercommon.FaultRecover || recoverEvent.Assertion == faultEvent.Assertion { return false -- Gitee From 391c0694f6aad0c89f0b3dd065d1f2d9dc83533b Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sun, 29 Sep 2024 20:23:34 +0800 Subject: [PATCH 18/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
pkg/common/device.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/common/device.go b/pkg/common/device.go index 01b064dd..16d13976 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -142,7 +142,6 @@ func getSwitchFaultLevelAndNodeStatus() (faultLevel string, NodeStatus string) { // getSimpleSwitchFaultStr convert fault info to string to display in switch fault info configmap func getSimpleSwitchFaultStr(faultInfo SwitchFaultEvent) (string, error) { - // todo 确定哪些字段是可以展示的 type simpleSwitchFaultInfo struct { EventType uint AssembledFaultCode string -- Gitee From 2f09248143000c786093ae45ea695ca1afbc8f2f Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Tue, 8 Oct 2024 01:08:12 +0000 Subject: [PATCH 19/31] update build/SwitchFaultCode.json. Signed-off-by: Lianjun Zhang Atlas --- build/SwitchFaultCode.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index a86f0563..4a50a5c9 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -17,4 +17,4 @@ "[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,na,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,L2,na]", "[0x00f1ff09,155914,npu,na]","[0x08520003,,cpu,na]","[0x08520003,,npu,na]" ] -}` \ No newline at end of file +} \ No newline at end of file -- Gitee From 236c7247891bf5f677696eb49fdc1b9273e9fb5c Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Tue, 8 Oct 2024 07:18:46 +0000 Subject: [PATCH 20/31] update build/SwitchFaultCode.json. 
Signed-off-by: Lianjun Zhang Atlas --- build/SwitchFaultCode.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index 4a50a5c9..ee7adb80 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -10,7 +10,7 @@ "[0x08520003,,L2,na]" ], "ResetFaultCodes": [ - "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,cpu,na]", + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f1ff09,155912,cpu,na]", "[0x00f1ff09,155912,npu,na]" ], "SeparateFaultCodes": [ -- Gitee From 816a1ed908d870b123110191eb3329dbd2f68d0d Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Tue, 8 Oct 2024 19:49:59 +0800 Subject: [PATCH 21/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/SwitchFaultCode.json | 8 ++------ pkg/common/constants.go | 26 ++++++++++++++++++++---- pkg/common/fault_code.go | 14 ++++++------- pkg/device/ascendcommon.go | 2 +- pkg/device/deviceswitch/ascend_switch.go | 17 ++++++++-------- 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index ee7adb80..1049e753 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -2,16 +2,12 @@ "NotHandleFaultCodes":[ "[0x00f103b6,155909,na,na]","[0x00f1ff06,155910,na,na]","[0x00f1ff09,155913,na,na]" ], - "ReportFaultCodes": [ - "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]" - ], "SubHealthFaultCodes": [ "[0x00f103b6,155908,na,na]","[0x00f10509,132332,na,na]","[0x00f10509,132333,na,na]","[0x00f1ff09,155912,L2,na]", - "[0x08520003,,L2,na]" + "[0x08520003,,L2,na]", "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]" ], "ResetFaultCodes": [ - "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f1ff09,155912,cpu,na]", - 
"[0x00f1ff09,155912,npu,na]" + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f1ff09,155912,cpu,na]","[0x00f1ff09,155912,npu,na]" ], "SeparateFaultCodes": [ "[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,na,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,L2,na]", diff --git a/pkg/common/constants.go b/pkg/common/constants.go index de950fc4..cc9fce8e 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -644,8 +644,8 @@ const ( PollFaultCodeCMMinInterval = 30 // GetSwitchFaultCodeInterval is the interval(second) of get all fault code by get interface GetSwitchFaultCodeInterval = 300 - // MinLengthOfFaultCode [0x00f103b0,155904,na,na] must contain at least "[" "]" - MinLengthOfFaultCode = 2 + // MaxLengthOfFaultCode [0x00f103b0,155904,na,na] must contain at most 50 characters + MaxLengthOfFaultCode = 50 // PartNumOfFaultCode [0x00f103b0,155904,na,na] must have 4 parts PartNumOfFaultCode = 4 // FaultCodeCMName is the name of the configmap that is used to save fault code @@ -727,9 +727,27 @@ const ( // peer device type of switch const ( + // PeerDeviceChipOrCpuPort if peer device is whole chip or cpu the given value should be 0 PeerDeviceChipOrCpuPort = 0 - PeerDeviceNpuPort = 1 - PeerDeviceL2Port = 2 + // PeerDeviceNpuPort 1 means switch contact peer device is npu + PeerDeviceNpuPort = 1 + // PeerDeviceL2Port 1 means switch contact peer device is L2 + PeerDeviceL2Port = 2 +) + +const ( + // EventTypeOfSwitchPortFault the event type of port down fault + EventTypeOfSwitchPortFault = 5 + // SubTypeOfPortDown the subtype of port down fault + SubTypeOfPortDown = 8 + // SubTypeOfPortLaneReduceHalf the subtype of lane reduce to half + SubTypeOfPortLaneReduceHalf = 449 + // SubTypeOfPortLaneReduceQuarter the subtype of lane reduce to quarter + SubTypeOfPortLaneReduceQuarter = 448 + // FaultIdOfPortLaneReduceHalf the fault id of lane reduce to half + FaultIdOfPortLaneReduceHalf = 132332 + // FaultIdOfPortLaneReduceQuarter the fault id of 
lane reduce to quarter + FaultIdOfPortLaneReduceQuarter = 132333 ) // LogicID list for reset, get id list of ring diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index 27aa591b..85a2575b 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -188,7 +188,6 @@ type faultFileInfo struct { // SwitchFaultFileInfo contains all fault code loading from faultconfig configmap or switchfaultconfig.json type SwitchFaultFileInfo struct { NotHandleFaultCodes []string - ReportFaultCodes []string SubHealthFaultCodes []string ResetFaultCodes []string SeparateFaultCodes []string @@ -496,20 +495,19 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { NotHandleFaultCodes = make([]string, 0, GeneralMapSize) PreSeparateFaultCodes = make([]string, 0, GeneralMapSize) SeparateFaultCodes = make([]string, 0, GeneralMapSize) - invalidFormatInfo := "failed to parse NotHandleFaultCodes faultCode:%v, will ignore it," + + invalidFormatInfo := "failed to parse %s faultCode:%v, will ignore it," + " please check if its format, such as: [0x00f1ff09,155914,cpu,na]" for _, code := range switchFileInfo.NotHandleFaultCodes { if !isValidSwitchFaultCode(code) { - hwlog.RunLog.Warnf(invalidFormatInfo, code) + hwlog.RunLog.Warnf(invalidFormatInfo, "NotHandleFaultCodes", code) continue } NotHandleFaultCodes = append(NotHandleFaultCodes, code) } - switchFileInfo.ReportFaultCodes = append(switchFileInfo.ReportFaultCodes, switchFileInfo.SubHealthFaultCodes...) 
- for _, code := range switchFileInfo.ReportFaultCodes { + for _, code := range switchFileInfo.SubHealthFaultCodes { if !isValidSwitchFaultCode(code) { - hwlog.RunLog.Warnf(invalidFormatInfo, code) + hwlog.RunLog.Warnf(invalidFormatInfo, "SubHealthFaultCodes", code) continue } PreSeparateFaultCodes = append(PreSeparateFaultCodes, code) @@ -518,7 +516,7 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { switchFileInfo.SeparateFaultCodes = append(switchFileInfo.SeparateFaultCodes, switchFileInfo.ResetFaultCodes...) for _, code := range switchFileInfo.SeparateFaultCodes { if !isValidSwitchFaultCode(code) { - hwlog.RunLog.Warnf(invalidFormatInfo, code) + hwlog.RunLog.Warnf(invalidFormatInfo, "SeparateFaultCodes", code) continue } SeparateFaultCodes = append(SeparateFaultCodes, code) @@ -529,7 +527,7 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { // isValidSwitchFaultCode to judge is a fault code is valid format as [0x00f1ff09,155914,cpu,na] func isValidSwitchFaultCode(code string) bool { - if len(code) <= MinLengthOfFaultCode { + if len(code) > MaxLengthOfFaultCode { return false } if !strings.HasPrefix(code, "[") || !strings.HasSuffix(code, "]") { diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go index 12a809b9..991d0357 100644 --- a/pkg/device/ascendcommon.go +++ b/pkg/device/ascendcommon.go @@ -27,7 +27,7 @@ import ( "huawei.com/npu-exporter/v6/common-utils/hwlog" "huawei.com/npu-exporter/v6/devmanager" npuCommon "huawei.com/npu-exporter/v6/devmanager/common" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index d7fa334a..b31dd3cb 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -107,7 +107,7 @@ type SwitchDevManager struct { var ( switchInitOnce 
sync.Once // EventTypeToFaultIDMapper 5/449 5/448 need to calculate in code - EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 13214, 8: 155910, 9: 155911, + EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 132134, 8: 155910, 9: 155911, 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} // FaultIdToAlarmIdMapper 5/8 need alarmID FaultIdToAlarmIdMapper = map[uint]string{ @@ -172,7 +172,7 @@ func (sdm *SwitchDevManager) ShutDownSwitch() { //export goFaultEventHandler func goFaultEventHandler(event *C.struct_LqDcmiEvent) { - // faultEventHandler callback function for subscribe mod, witch will receive fault code when fault happens + // faultEventHandler callback function for subscribe mod, which will receive fault code when fault happens faultEvent := convertFaultEvent(event) hwlog.RunLog.Warnf("switch subscribe got fault:%#v, faultCode:%v", faultEvent, faultEvent.AssembledFaultCode) // for recovered fault, delete them from current fault codes @@ -252,6 +252,7 @@ func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { errorCodes = append(errorCodes, faultEvent.AssembledFaultCode) retErrorInfo = append(retErrorInfo, faultEvent) } + // DO NOT edit this log, if necessary DO inform fault dialog hwlog.RunLog.Warnf("switch of 910A3 get fault codes: %#v", errorCodes) return retErrorInfo, nil } @@ -281,12 +282,12 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { func setExtraFaultInfo(event *common.SwitchFaultEvent) error { faultID, ok := uint(0), false // while eventType is 5, it depends on subtype to tell its faultID - if event.EventType == 5 { + if event.EventType == common.EventTypeOfSwitchPortFault { switch event.SubType { - case 448: - faultID = uint(132333) - case 449: - faultID = uint(132332) + case common.SubTypeOfPortLaneReduceQuarter: + faultID = uint(common.FaultIdOfPortLaneReduceQuarter) + case common.SubTypeOfPortLaneReduceHalf: + faultID = 
uint(common.FaultIdOfPortLaneReduceHalf) } } else { faultID, ok = EventTypeToFaultIDMapper[event.EventType] @@ -330,7 +331,7 @@ func isPortLevelFault(eventType int) bool { return false } -// isFaultRecoveredEvent to judge if recoverEvent is the recover event for faultEvent 的故障恢复消息 +// isFaultRecoveredEvent to judge if recoverEvent is the recover event for faultEvent func isFaultRecoveredEvent(faultEvent, recoverEvent common.SwitchFaultEvent) bool { if int8(recoverEvent.Assertion) != devmanagercommon.FaultRecover || recoverEvent.Assertion == faultEvent.Assertion { return false -- Gitee From a09026697055f546d80a02354ff5bf10d3b5d9f8 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Tue, 8 Oct 2024 19:50:59 +0800 Subject: [PATCH 22/31] =?UTF-8?q?dp=E9=80=82=E9=85=8D=E6=95=85=E9=9A=9C?= =?UTF-8?q?=E7=A0=81=E6=A0=BC=E5=BC=8F=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index b31dd3cb..69d37b07 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -97,7 +97,6 @@ import "C" const ( maxFaultNum = 128 - subTypeBase = 1000 ) // SwitchDevManager is the manager for switch -- Gitee From d5960db50cc7571214ffaf57ed996d7a5f919818 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Thu, 17 Oct 2024 10:15:11 +0800 Subject: [PATCH 23/31] =?UTF-8?q?5/8=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 69d37b07..f2afecdb 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ 
-280,9 +280,13 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { // setExtraFaultInfo to convert fault event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] func setExtraFaultInfo(event *common.SwitchFaultEvent) error { faultID, ok := uint(0), false + alarmID := "" // while eventType is 5, it depends on subtype to tell its faultID if event.EventType == common.EventTypeOfSwitchPortFault { switch event.SubType { + case common.SubTypeOfPortDown: + faultID = uint(0) + alarmID = "0x08520003" case common.SubTypeOfPortLaneReduceQuarter: faultID = uint(common.FaultIdOfPortLaneReduceQuarter) case common.SubTypeOfPortLaneReduceHalf: @@ -294,9 +298,9 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { hwlog.RunLog.Warnf("failed to find faultID for eventType: %d", event.EventType) } } - - alarmID := "" - alarmID, ok = FaultIdToAlarmIdMapper[faultID] + if alarmID == "" { + alarmID, ok = FaultIdToAlarmIdMapper[faultID] + } if !ok { hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) } @@ -315,7 +319,11 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { PeerDeviceName = "na" } // currently the last part means ports, remain na for now - event.AssembledFaultCode = fmt.Sprintf("[%s,%d,%s,na]", alarmID, faultID, PeerDeviceName) + if faultID == uint(0) { + event.AssembledFaultCode = fmt.Sprintf("[%s,,%s,na]", alarmID, PeerDeviceName) + } else { + event.AssembledFaultCode = fmt.Sprintf("[%s,%d,%s,na]", alarmID, faultID, PeerDeviceName) + } event.FaultID = faultID return nil } -- Gitee From 3952a6e748f217890e4555e17470bac823c6736f Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Thu, 17 Oct 2024 10:21:56 +0800 Subject: [PATCH 24/31] =?UTF-8?q?5/8=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index f2afecdb..b1e9ba21 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -325,6 +325,8 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { event.AssembledFaultCode = fmt.Sprintf("[%s,%d,%s,na]", alarmID, faultID, PeerDeviceName) } event.FaultID = faultID + // switch will not report alarm time, use received time as alarm raised time + event.AlarmRaisedTime = time.Now().Unix() return nil } -- Gitee From e94aab93c6313d257cb84ff469076146aa72974b Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Thu, 17 Oct 2024 10:24:50 +0800 Subject: [PATCH 25/31] =?UTF-8?q?5/8=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index b1e9ba21..2c0edadf 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -279,8 +279,7 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { // setExtraFaultInfo to convert fault event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] func setExtraFaultInfo(event *common.SwitchFaultEvent) error { - faultID, ok := uint(0), false - alarmID := "" + faultID, ok, alarmID := uint(0), false, "" // while eventType is 5, it depends on subtype to tell its faultID if event.EventType == common.EventTypeOfSwitchPortFault { switch event.SubType { @@ -304,8 +303,7 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { if !ok { hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) } - PeerDeviceType := int(event.PeerPortDevice) - PeerDeviceName := "" + PeerDeviceType, PeerDeviceName := int(event.PeerPortDevice), "" if 
isPortLevelFault(int(event.EventType)) { switch PeerDeviceType { case common.PeerDeviceChipOrCpuPort: @@ -325,8 +323,9 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { event.AssembledFaultCode = fmt.Sprintf("[%s,%d,%s,na]", alarmID, faultID, PeerDeviceName) } event.FaultID = faultID - // switch will not report alarm time, use received time as alarm raised time - event.AlarmRaisedTime = time.Now().Unix() + if event.AlarmRaisedTime == int64(0) { + event.AlarmRaisedTime = time.Now().Unix() + } return nil } -- Gitee From 0ca916f1c92d5bfff44fd06a49b3e83b4c251d1b Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Sat, 19 Oct 2024 14:16:30 +0800 Subject: [PATCH 26/31] =?UTF-8?q?=E4=B8=8D=E8=AF=86=E5=88=AB=E6=95=85?= =?UTF-8?q?=E9=9A=9C=E7=A0=81=E4=B8=8D=E4=B8=8A=E6=8A=A5=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0155913=E6=95=85=E9=9A=9C=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/SwitchFaultCode.json | 5 +++-- pkg/common/device.go | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index 1049e753..8600ae2f 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -4,10 +4,11 @@ ], "SubHealthFaultCodes": [ "[0x00f103b6,155908,na,na]","[0x00f10509,132332,na,na]","[0x00f10509,132333,na,na]","[0x00f1ff09,155912,L2,na]", - "[0x08520003,,L2,na]", "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]" + "[0x00f1ff09,155913,L2,na]", "[0x08520003,,L2,na]", "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]" ], "ResetFaultCodes": [ - "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f1ff09,155912,cpu,na]","[0x00f1ff09,155912,npu,na]" + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f1ff09,155912,cpu,na]","[0x00f1ff09,155912,npu,na]", + "[0x00f1ff09,155913,cpu,na]","[0x00f1ff09,155913,npu,na]" ], "SeparateFaultCodes": [ 
"[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,na,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,L2,na]", diff --git a/pkg/common/device.go b/pkg/common/device.go index 16d13976..c0f47c0c 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -103,6 +103,10 @@ func GetSwitchFaultInfo() SwitchFaultInfo { reportFaultCodes := make([]string, 0) for _, faultInfo := range GetSwitchFaultCode() { + if _, ok := SwitchFaultLevelMap[faultInfo.AssembledFaultCode]; !ok { + hwlog.RunLog.Warnf("received unregisterd faultCode:%s, will not report", faultInfo.AssembledFaultCode) + continue + } faultStr, err := getSimpleSwitchFaultStr(faultInfo) if err != nil { continue -- Gitee From 082fe91397e35a53185cec016e881df3689ea245 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Mon, 21 Oct 2024 06:48:31 +0000 Subject: [PATCH 27/31] =?UTF-8?q?=E9=99=8Dlane=E6=95=85=E9=9A=9C=E7=A0=81?= =?UTF-8?q?=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lianjun Zhang Atlas --- build/SwitchFaultCode.json | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index 8600ae2f..15350f20 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -3,12 +3,13 @@ "[0x00f103b6,155909,na,na]","[0x00f1ff06,155910,na,na]","[0x00f1ff09,155913,na,na]" ], "SubHealthFaultCodes": [ - "[0x00f103b6,155908,na,na]","[0x00f10509,132332,na,na]","[0x00f10509,132333,na,na]","[0x00f1ff09,155912,L2,na]", - "[0x00f1ff09,155913,L2,na]", "[0x08520003,,L2,na]", "[0x00f1ff06,132134,na,na]","[0x00f1ff06,155911,na,na]" + "[0x00f103b6,155908,na,na]", "[0x00f10509,132332,cpu,na]","[0x00f10509,132332,npu,na]", "[0x00f10509,132332,L2,na]", + "[0x00f10509,132333,cpu,na]","[0x00f10509,132333,npu,na]", "[0x00f10509,132333,L2,na]", "[0x00f1ff06,132134,na,na]", + "[0x00f1ff06,155911,na,na]","[0x00f1ff09,155912,L2,na]","[0x00f1ff09,155913,L2,na]", 
"[0x08520003,,L2,na]" ], "ResetFaultCodes": [ - "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f1ff09,155912,cpu,na]","[0x00f1ff09,155912,npu,na]", - "[0x00f1ff09,155913,cpu,na]","[0x00f1ff09,155913,npu,na]" + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,cpu,na]", + "[0x00f1ff09,155912,npu,na]", "[0x00f1ff09,155913,cpu,na]","[0x00f1ff09,155913,npu,na]" ], "SeparateFaultCodes": [ "[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,na,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,L2,na]", -- Gitee From 19a236ee53a101aff2d3441c5ee2fe654d556fb6 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Mon, 21 Oct 2024 18:56:38 +0800 Subject: [PATCH 28/31] =?UTF-8?q?=E9=87=8D=E8=AF=95=E6=B5=81=E6=B0=B4?= =?UTF-8?q?=E7=BA=BF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 2c0edadf..62731d1f 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -330,8 +330,8 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { } // isPortLevelFault to judge if a fault is related to whole chip or any of its ports -// if it is whole chip peerdeviceType will be "na" while peerdeivce==0 -// else it is for its chip, peerdeviceType will be "cpu" while peerdeivce==0 +// if it is whole chip peer deviceType will be "na" while peerdevice==0 +// else it is for its chip, peer deviceType will be "cpu" while peerdevice==0 func isPortLevelFault(eventType int) bool { if eventType == 3 || eventType == 4 || eventType == 5 || eventType == 14 || eventType == 15 { return true -- Gitee From ae131f29bd46ea1ca79f0b84f3c87ad5abca1dfe Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Tue, 22 Oct 2024 08:14:43 +0000 Subject: [PATCH
29/31] =?UTF-8?q?=E6=97=A5=E5=BF=97=E4=BD=8D=E7=BD=AE?= =?UTF-8?q?=E8=B0=83=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lianjun Zhang Atlas --- pkg/device/deviceswitch/ascend_switch.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 62731d1f..36d8bbe5 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -299,10 +299,11 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { } if alarmID == "" { alarmID, ok = FaultIdToAlarmIdMapper[faultID] + if !ok { + hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) + } } - if !ok { - hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) - } + PeerDeviceType, PeerDeviceName := int(event.PeerPortDevice), "" if isPortLevelFault(int(event.EventType)) { switch PeerDeviceType { -- Gitee From 608be34ef55efcf3a027aeca7d81e2d9fe70e111 Mon Sep 17 00:00:00 2001 From: Lianjun Zhang Atlas Date: Tue, 22 Oct 2024 08:31:24 +0000 Subject: [PATCH 30/31] =?UTF-8?q?=E5=A4=8D=E6=9D=82=E5=BA=A6cleancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Lianjun Zhang Atlas --- pkg/device/deviceswitch/ascend_switch.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index 36d8bbe5..a0127fb0 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -298,8 +298,7 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { } } if alarmID == "" { - alarmID, ok = FaultIdToAlarmIdMapper[faultID] - if !ok { + if alarmID, ok = FaultIdToAlarmIdMapper[faultID];!ok{ hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) } } -- 
Gitee From efd4325c8808677914354cd7c7d4068c2e8d9aa2 Mon Sep 17 00:00:00 2001 From: lianjun-zhang Date: Tue, 22 Oct 2024 16:37:42 +0800 Subject: [PATCH 31/31] =?UTF-8?q?=E5=A4=84=E7=90=86cleancode?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkg/device/deviceswitch/ascend_switch.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/device/deviceswitch/ascend_switch.go b/pkg/device/deviceswitch/ascend_switch.go index a0127fb0..ded70683 100644 --- a/pkg/device/deviceswitch/ascend_switch.go +++ b/pkg/device/deviceswitch/ascend_switch.go @@ -285,7 +285,7 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { switch event.SubType { case common.SubTypeOfPortDown: faultID = uint(0) - alarmID = "0x08520003" + alarmID, ok = "0x08520003", true case common.SubTypeOfPortLaneReduceQuarter: faultID = uint(common.FaultIdOfPortLaneReduceQuarter) case common.SubTypeOfPortLaneReduceHalf: @@ -298,11 +298,11 @@ func setExtraFaultInfo(event *common.SwitchFaultEvent) error { } } if alarmID == "" { - if alarmID, ok = FaultIdToAlarmIdMapper[faultID];!ok{ - hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) - } + alarmID, ok = FaultIdToAlarmIdMapper[faultID] + } + if !ok { + hwlog.RunLog.Warnf("failed to find alarm id for eventType: %d", event.EventType) } - PeerDeviceType, PeerDeviceName := int(event.PeerPortDevice), "" if isPortLevelFault(int(event.EventType)) { switch PeerDeviceType { -- Gitee