diff --git a/build/SwitchFaultCode.json b/build/SwitchFaultCode.json index ed3d1d2cc8231aaac7ac8bd72b14a4f4cb13cad5..15350f20de75e6df81b9720bdaf5b65455b9b827 100644 --- a/build/SwitchFaultCode.json +++ b/build/SwitchFaultCode.json @@ -1,19 +1,18 @@ { - "NotHandleFaultCodes":["0000002d"], - "ReportFaultCodes": [ - "00000002", "00000003", "0000520a", "00000055", "00000056", "00000057", "00000058", "00000059", "0000005a", - "0000005b", "0000005c", "000001be"], + "NotHandleFaultCodes":[ + "[0x00f103b6,155909,na,na]","[0x00f1ff06,155910,na,na]","[0x00f1ff09,155913,na,na]" + ], "SubHealthFaultCodes": [ - "00000008", "00002712", "00002afa", "0000000c", "0000000d", "0000000f", "00000010", "00000012", "000055f2", - "000059da", "000061aa", "000000d1", "000000d2", "000000d3", "000000d4", "000000d5", "000000d6", "000000d7", - "000000d8", "000000f3", "000000f4", "000000f5", "000000f6", "00000113", "00000114", "00000115", "00000140", - "000001c0", "000001c1","00001f42"], + "[0x00f103b6,155908,na,na]", "[0x00f10509,132332,cpu,na]","[0x00f10509,132332,npu,na]", "[0x00f10509,132332,L2,na]", + "[0x00f10509,132333,cpu,na]","[0x00f10509,132333,npu,na]", "[0x00f10509,132333,L2,na]", "[0x00f1ff06,132134,na,na]", + "[0x00f1ff06,155911,na,na]","[0x00f1ff09,155912,L2,na]","[0x00f1ff09,155913,L2,na]", "[0x08520003,,L2,na]" + ], "ResetFaultCodes": [ - "0000001b", "0000001c", "00002710", "00002af8", "00002711", "00002af9", "0000002e", "0000003a", "00000025", - "0000003b", "0000003c", "0000003f", "00000040", "00005208", "00005209", "000055f0", "000059d8", "000055f1", - "000059d9", "000061a8", "000061a9", "000000cf", "000000d0", "000000f9", "000000fa", "0000014a", "00000151", - "00000148", "00000007", "00000004", "00000005", "00000006"], + "[0x00f103b0,155649,na,na]","[0x00f103b0,155904,na,na]","[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,cpu,na]", + "[0x00f1ff09,155912,npu,na]", "[0x00f1ff09,155913,cpu,na]","[0x00f1ff09,155913,npu,na]" + ], "SeparateFaultCodes": [ - "000000de", "000000ff", 
"00000116", "00000117", "00000149", "0000014b", "0000014c", "0000014d", "0000014e", - "0000014f", "00001f40", "00001f41"] + "[0x00f103b0,155907,na,na]","[0x00f1ff09,155912,na,na]","[0x00f1ff09,155914,cpu,na]","[0x00f1ff09,155914,L2,na]", + "[0x00f1ff09,155914,npu,na]","[0x08520003,,cpu,na]","[0x08520003,,npu,na]" + ] } \ No newline at end of file diff --git a/pkg/common/constants.go b/pkg/common/constants.go index c80310619f7f0c04e89401e747660ea8398940d3..cc9fce8ecbe392a72aeab82e22957939a82b9865 100644 --- a/pkg/common/constants.go +++ b/pkg/common/constants.go @@ -644,6 +644,10 @@ const ( PollFaultCodeCMMinInterval = 30 // GetSwitchFaultCodeInterval is the interval(second) of get all fault code by get interface GetSwitchFaultCodeInterval = 300 + // MaxLengthOfFaultCode [0x00f103b0,155904,na,na] must contain at most 50 characters + MaxLengthOfFaultCode = 50 + // PartNumOfFaultCode [0x00f103b0,155904,na,na] must have 4 parts + PartNumOfFaultCode = 4 // FaultCodeCMName is the name of the configmap that is used to save fault code FaultCodeCMName = "mindx-dl-fault-config" // FaultCodeCMNameSpace is the namespace of the fault code configmap @@ -721,6 +725,31 @@ const ( FaultSeverityCritical ) +// peer device type of switch +const ( + // PeerDeviceChipOrCpuPort if peer device is whole chip or cpu the given value should be 0 + PeerDeviceChipOrCpuPort = 0 + // PeerDeviceNpuPort 1 means switch contact peer device is npu + PeerDeviceNpuPort = 1 + // PeerDeviceL2Port 1 means switch contact peer device is L2 + PeerDeviceL2Port = 2 +) + +const ( + // EventTypeOfSwitchPortFault the event type of port down fault + EventTypeOfSwitchPortFault = 5 + // SubTypeOfPortDown the subtype of port down fault + SubTypeOfPortDown = 8 + // SubTypeOfPortLaneReduceHalf the subtype of lane reduce to half + SubTypeOfPortLaneReduceHalf = 449 + // SubTypeOfPortLaneReduceQuarter the subtype of lane reduce to quarter + SubTypeOfPortLaneReduceQuarter = 448 + // FaultIdOfPortLaneReduceHalf the fault id 
of lane reduce to half + FaultIdOfPortLaneReduceHalf = 132332 + // FaultIdOfPortLaneReduceQuarter the fault id of lane reduce to quarter + FaultIdOfPortLaneReduceQuarter = 132333 +) + // LogicID list for reset, get id list of ring const ( ManuallySeparateNpuFirstHandle = "FirstHandle" diff --git a/pkg/common/device.go b/pkg/common/device.go index a01c4a1e752e27331ab9ecdd8adadca31e547679..c0f47c0ca7e09a64ef5ce3faad91a28803a85302 100644 --- a/pkg/common/device.go +++ b/pkg/common/device.go @@ -16,6 +16,7 @@ package common import ( + "encoding/json" "fmt" "strconv" "strings" @@ -46,12 +47,12 @@ const ( var ( // SwitchFaultLevelMapLock Lock SwitchFaultLevelMap to avoid concurrence write and read SwitchFaultLevelMapLock sync.Mutex - // SwitchFaultLevelMap record every fault code and it's level - SwitchFaultLevelMap = make(map[int64]int, GeneralMapSize) + // SwitchFaultLevelMap record every switch fault code and it's level + SwitchFaultLevelMap = make(map[string]int, GeneralMapSize) // SwitchFaultLock is used for CurrentSwitchFault which may be used concurrence SwitchFaultLock sync.Mutex // CurrentSwitchFault store all switch fault which will be reported to device-info configmap - currentSwitchFault = make([]int64, 0, GeneralMapSize) + currentSwitchFault = make([]SwitchFaultEvent, 0, GeneralMapSize) ) // GetDeviceID get device physical id and virtual by device name @@ -93,18 +94,43 @@ func GetSwitchFaultInfo() SwitchFaultInfo { if ParamOption.RealCardType != common.Ascend910A3 || !ParamOption.EnableSwitchFault { return SwitchFaultInfo{} } - maxFaultLevel := 0 + SwitchFaultLock.Lock() defer SwitchFaultLock.Unlock() SwitchFaultLevelMapLock.Lock() + + faultLevel, NodeStatus := getSwitchFaultLevelAndNodeStatus() + + reportFaultCodes := make([]string, 0) + for _, faultInfo := range GetSwitchFaultCode() { + if _, ok := SwitchFaultLevelMap[faultInfo.AssembledFaultCode]; !ok { + hwlog.RunLog.Warnf("received unregisterd faultCode:%s, will not report", 
faultInfo.AssembledFaultCode) + continue + } + faultStr, err := getSimpleSwitchFaultStr(faultInfo) + if err != nil { + continue + } + reportFaultCodes = append(reportFaultCodes, faultStr) + } + return SwitchFaultInfo{ + FaultCode: reportFaultCodes, + FaultLevel: faultLevel, + UpdateTime: time.Now().Unix(), + NodeStatus: NodeStatus, + } +} + +func getSwitchFaultLevelAndNodeStatus() (faultLevel string, NodeStatus string) { + maxFaultLevel := 0 for _, code := range currentSwitchFault { - level := SwitchFaultLevelMap[code] + level := SwitchFaultLevelMap[code.AssembledFaultCode] if level > maxFaultLevel { maxFaultLevel = level } } SwitchFaultLevelMapLock.Unlock() - faultLevel, NodeStatus := NotHandleFaultLevelStr, "Healthy" + faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" switch maxFaultLevel { case NotHandleFaultLevel: faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" @@ -115,31 +141,50 @@ func GetSwitchFaultInfo() SwitchFaultInfo { default: faultLevel, NodeStatus = NotHandleFaultLevelStr, "Healthy" } - // keep those none zero codes - reportFaultCodes := make([]string, 0) - for _, code := range currentSwitchFault { - if code != 0 { - reportFaultCodes = append(reportFaultCodes, fmt.Sprintf("%08x", code)) - } + return faultLevel, NodeStatus +} + +// getSimpleSwitchFaultStr convert fault info to string to display in switch fault info configmap +func getSimpleSwitchFaultStr(faultInfo SwitchFaultEvent) (string, error) { + type simpleSwitchFaultInfo struct { + EventType uint + AssembledFaultCode string + PeerPortDevice uint + PeerPortId uint + SwitchChipId uint + SwitchPortId uint + Severity uint + Assertion uint + AlarmRaisedTime int64 } - return SwitchFaultInfo{ - FaultCode: reportFaultCodes, - FaultLevel: faultLevel, - UpdateTime: time.Now().Unix(), - NodeStatus: NodeStatus, + bytes, err := json.Marshal(simpleSwitchFaultInfo{ + EventType: faultInfo.EventType, + AssembledFaultCode: faultInfo.AssembledFaultCode, + PeerPortDevice: 
faultInfo.PeerPortDevice, + PeerPortId: faultInfo.PeerPortId, + SwitchChipId: faultInfo.SwitchChipId, + SwitchPortId: faultInfo.SwitchPortId, + Severity: faultInfo.Severity, + Assertion: faultInfo.Assertion, + AlarmRaisedTime: faultInfo.AlarmRaisedTime, + }) + if err != nil { + hwlog.RunLog.Warnf("failed to convert fault:%#v, err: %v", faultInfo, err) + return "", err } + return string(bytes), nil } // SetSwitchFaultCode set switch fault code -func SetSwitchFaultCode(newFaults []int64) { +func SetSwitchFaultCode(newFaults []SwitchFaultEvent) { SwitchFaultLock.Lock() defer SwitchFaultLock.Unlock() currentSwitchFault = newFaults } // GetSwitchFaultCode get switch fault code -func GetSwitchFaultCode() []int64 { +func GetSwitchFaultCode() []SwitchFaultEvent { return currentSwitchFault } diff --git a/pkg/common/device_test.go b/pkg/common/device_test.go index 5aac88b603dc43e9931e126edb392f9cf71f747c..68050e94a6ab5fb87a3af1f26b511bc0c0903014 100644 --- a/pkg/common/device_test.go +++ b/pkg/common/device_test.go @@ -28,7 +28,7 @@ import ( const ( cardNum = 2 - generalFaultCode = 100 + generalFaultCode = "[0x00f103b0,155649,na,NoneExist]" firstFaultIdx = 0 ) @@ -206,24 +206,24 @@ func TestGetSwitchFaultInfo(t *testing.T) { convey.Convey("test GetSwitchFaultInfo", t, func() { ParamOption.RealCardType = common.Ascend910A3 ParamOption.EnableSwitchFault = true - currentSwitchFault = []int64{} - SwitchFaultLevelMap = map[int64]int{} + currentSwitchFault = []SwitchFaultEvent{} + SwitchFaultLevelMap = map[string]int{} convey.Convey("test empty SwitchFaultLevelMap", func() { - currentSwitchFault = append(currentSwitchFault, generalFaultCode) + currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{AssembledFaultCode: generalFaultCode}) fault := GetSwitchFaultInfo() convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) }) convey.Convey("test actually level", func() { - currentSwitchFault = append(currentSwitchFault, generalFaultCode) - 
SwitchFaultLevelMap = map[int64]int{generalFaultCode: NotHandleFaultLevel} + currentSwitchFault = append(currentSwitchFault, SwitchFaultEvent{AssembledFaultCode: generalFaultCode}) + SwitchFaultLevelMap = map[string]int{generalFaultCode: NotHandleFaultLevel} fault := GetSwitchFaultInfo() convey.So(fault.FaultLevel == NotHandleFaultLevelStr, convey.ShouldBeTrue) - SwitchFaultLevelMap = map[int64]int{generalFaultCode: PreSeparateFaultLevel} + SwitchFaultLevelMap = map[string]int{generalFaultCode: PreSeparateFaultLevel} fault = GetSwitchFaultInfo() convey.So(fault.FaultLevel == PreSeparateFaultLevelStr, convey.ShouldBeTrue) - SwitchFaultLevelMap = map[int64]int{generalFaultCode: SeparateFaultLevel} + SwitchFaultLevelMap = map[string]int{generalFaultCode: SeparateFaultLevel} fault = GetSwitchFaultInfo() convey.So(fault.FaultLevel == SeparateFaultLevelStr, convey.ShouldBeTrue) }) diff --git a/pkg/common/fault_code.go b/pkg/common/fault_code.go index de6a44432065c4b1e81842aa1b10c4a0e485c117..85a2575bb7b79567f9b5dff52a9f5ec79b2d3ebe 100644 --- a/pkg/common/fault_code.go +++ b/pkg/common/fault_code.go @@ -86,11 +86,11 @@ const ( var ( faultTypeCode FaultTypeCode // NotHandleFaultCodes contains all fault code that believed to be not handled, in this case is L1 - NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) + NotHandleFaultCodes = make([]string, 0, GeneralMapSize) // PreSeparateFaultCodes contains all fault code that believed to be PreSeparate, in this case is L2-L3 - PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) + PreSeparateFaultCodes = make([]string, 0, GeneralMapSize) // SeparateFaultCodes contains all fault code that believed to be Separate, in this case is L4-L5 - SeparateFaultCodes = make([]int64, 0, GeneralMapSize) + SeparateFaultCodes = make([]string, 0, GeneralMapSize) // initLogicIDs need init fault code device. 
add by train or inference initLogicIDs []int32 // logicIDLock operate initLogicIDs lock @@ -188,7 +188,6 @@ type faultFileInfo struct { // SwitchFaultFileInfo contains all fault code loading from faultconfig configmap or switchfaultconfig.json type SwitchFaultFileInfo struct { NotHandleFaultCodes []string - ReportFaultCodes []string SubHealthFaultCodes []string ResetFaultCodes []string SeparateFaultCodes []string @@ -493,42 +492,51 @@ func LoadSwitchFaultCode(switchFaultCodeByte []byte) error { return fmt.Errorf("failed to unmarsha switch fault code, err: %s", err.Error()) } - NotHandleFaultCodes = make([]int64, 0, GeneralMapSize) - PreSeparateFaultCodes = make([]int64, 0, GeneralMapSize) - SeparateFaultCodes = make([]int64, 0, GeneralMapSize) - + NotHandleFaultCodes = make([]string, 0, GeneralMapSize) + PreSeparateFaultCodes = make([]string, 0, GeneralMapSize) + SeparateFaultCodes = make([]string, 0, GeneralMapSize) + invalidFormatInfo := "failed to parse %s faultCode:%v, will ignore it," + + " please check if its format, such as: [0x00f1ff09,155914,cpu,na]" for _, code := range switchFileInfo.NotHandleFaultCodes { - codeInt64, err := strconv.ParseInt(code, Hex, BitSize) - if err != nil { - hwlog.RunLog.Warnf("failed to parse NotHandleFaultCodes faultcode:%v", code) + if !isValidSwitchFaultCode(code) { + hwlog.RunLog.Warnf(invalidFormatInfo, "NotHandleFaultCodes", code) continue } - NotHandleFaultCodes = append(NotHandleFaultCodes, codeInt64) + NotHandleFaultCodes = append(NotHandleFaultCodes, code) } - switchFileInfo.ReportFaultCodes = append(switchFileInfo.ReportFaultCodes, switchFileInfo.SubHealthFaultCodes...) 
- for _, code := range switchFileInfo.ReportFaultCodes { - codeInt64, err := strconv.ParseInt(code, Hex, BitSize) - if err != nil { - hwlog.RunLog.Warnf("failed to parse PreSeparateFaultCodes:%v", code) + for _, code := range switchFileInfo.SubHealthFaultCodes { + if !isValidSwitchFaultCode(code) { + hwlog.RunLog.Warnf(invalidFormatInfo, "SubHealthFaultCodes", code) continue } - PreSeparateFaultCodes = append(PreSeparateFaultCodes, codeInt64) + PreSeparateFaultCodes = append(PreSeparateFaultCodes, code) } switchFileInfo.SeparateFaultCodes = append(switchFileInfo.SeparateFaultCodes, switchFileInfo.ResetFaultCodes...) for _, code := range switchFileInfo.SeparateFaultCodes { - codeInt64, err := strconv.ParseInt(code, Hex, BitSize) - if err != nil { - hwlog.RunLog.Warnf("failed to parse SeparateFaultCodes:%v", code) + if !isValidSwitchFaultCode(code) { + hwlog.RunLog.Warnf(invalidFormatInfo, "SeparateFaultCodes", code) continue } - SeparateFaultCodes = append(SeparateFaultCodes, codeInt64) + SeparateFaultCodes = append(SeparateFaultCodes, code) } return nil } +// isValidSwitchFaultCode to judge is a fault code is valid format as [0x00f1ff09,155914,cpu,na] +func isValidSwitchFaultCode(code string) bool { + if len(code) > MaxLengthOfFaultCode { + return false + } + if !strings.HasPrefix(code, "[") || !strings.HasSuffix(code, "]") { + return false + } + parts := strings.Split(code, CommaSepDev) + return len(parts) == PartNumOfFaultCode +} + func loadFaultDurationCustomization(customization []FaultDurationCustomization) { handledEventId := make(sets.String, common.MaxErrorCodeCount) for _, cus := range customization { @@ -1269,7 +1277,7 @@ func collectEachFaultEvent(logicId int32, faultInfos []common.DevFaultInfo) { } if faultDurationMap[eventIdStr].Duration == nil { - faultDurationMap[eventIdStr].Duration = make(map[int32]FaultDurationData, 0) + faultDurationMap[eventIdStr].Duration = make(map[int32]FaultDurationData, GeneralMapSize) } if _, ok := 
// SwitchFaultEvent is the struct for switch reported fault
type SwitchFaultEvent struct {
	// EventType is the event type of the fault
	EventType uint
	// SubType fault subtype used for id a fault
	SubType uint
	// FaultID is kept to maintain compatibility with the next incoming switch fault code changes.
	// In this version this field is obtained through manual mapping in device-plugin,
	// in the next version this field will be reported by the switch driver.
	FaultID uint
	// AssembledFaultCode is to assemble cgo lq.struct_LqDcmiEvent to a device-plugin recognized fault code type
	// such as : [0x00f103b0,155904,na,na] in config file: SwitchFaultCode
	AssembledFaultCode string
	// PeerPortDevice used to tell what kind of device connected to
	PeerPortDevice uint
	PeerPortId     uint
	SwitchChipId   uint
	SwitchPortId   uint
	// Severity used to tell how serious is the fault
	Severity uint
	// Assertion tell what kind of fault, recover, happen or once
	Assertion       uint
	EventSerialNum  int
	NotifySerialNum int
	// AlarmRaisedTime is an int64 timestamp of when the alarm was raised
	// NOTE(review): presumably unix seconds - confirm against the switch driver
	AlarmRaisedTime int64
	AdditionalParam string
	AdditionalInfo  string
}
Switch, - // this kind of fault code will be faultcode * 1000 + PeerPortDevice - duplicateSubEventMap = map[int]bool{8: true, 10: true, 11: true, 21: true, 22: true, 23: true, 25: true} + // EventTypeToFaultIDMapper 5/449 5/448 need to calculate in code + EventTypeToFaultIDMapper = map[uint]uint{13: 155907, 12: 155649, 11: 155904, 10: 132134, 8: 155910, 9: 155911, + 7: 155908, 6: 155909, 5: 155912, 4: 155913, 3: 155914} + // FaultIdToAlarmIdMapper 5/8 need alarmID + FaultIdToAlarmIdMapper = map[uint]string{ + 155907: "0x00f103b0", 155649: "0x00f103b0", 155904: "0x00f103b0", + 132134: "0x00f1ff06", 155910: "0x00f1ff06", 155911: "0x00f1ff06", + 155908: "0x00f103b6", 155909: "0x00f103b6", + 155912: "0x00f1ff09", 155913: "0x00f1ff09", 155914: "0x00f1ff09", + 132332: "0x00f10509", 132333: "0x00f10509", + } ) // UpdateSwitchFaultLevel update the map recording fault code and it's level, as long as deviceinfo changed func UpdateSwitchFaultLevel() { common.SwitchFaultLevelMapLock.Lock() defer common.SwitchFaultLevelMapLock.Unlock() - common.SwitchFaultLevelMap = make(map[int64]int, common.GeneralMapSize) + common.SwitchFaultLevelMap = make(map[string]int, common.GeneralMapSize) for _, code := range common.NotHandleFaultCodes { common.SwitchFaultLevelMap[code] = common.NotHandleFaultLevel } @@ -185,23 +171,23 @@ func (sdm *SwitchDevManager) ShutDownSwitch() { //export goFaultEventHandler func goFaultEventHandler(event *C.struct_LqDcmiEvent) { - // faultEventHandler callback function for subscribe mod, witch will receive fault code when fault happens + // faultEventHandler callback function for subscribe mod, which will receive fault code when fault happens faultEvent := convertFaultEvent(event) - hwlog.RunLog.Warnf("switch subscribe got fault:%#v, hex:%v", faultEvent, - fmt.Sprintf("%08x", faultEvent.SubType)) + hwlog.RunLog.Warnf("switch subscribe got fault:%#v, faultCode:%v", faultEvent, faultEvent.AssembledFaultCode) // for recovered fault, delete them from current fault 
codes if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { - newFaultCodes := make([]int64, 0) - for _, code := range common.GetSwitchFaultCode() { - if code != int64(faultEvent.SubType) { - newFaultCodes = append(newFaultCodes, code) + newFaultCodes := make([]common.SwitchFaultEvent, 0) + for _, errInfo := range common.GetSwitchFaultCode() { + // only in faultEvent and recoverEvent major info is the same it will be thought as recover + if !isFaultRecoveredEvent(errInfo, faultEvent) { + newFaultCodes = append(newFaultCodes, errInfo) } } common.SetSwitchFaultCode(newFaultCodes) return } currentFault := common.GetSwitchFaultCode() - common.SetSwitchFaultCode(append(currentFault, int64(faultEvent.SubType))) + common.SetSwitchFaultCode(append(currentFault, faultEvent)) } // GetSwitchFaultCodeByInterval start a none stop loop to query and update switch fault code @@ -223,9 +209,7 @@ func (sdm *SwitchDevManager) GetSwitchFaultCodeByInterval(ctx context.Context, i time.Sleep(interval) continue } - common.SetSwitchFaultCode(errCodes) - time.Sleep(interval) } } @@ -244,35 +228,37 @@ func (sdm *SwitchDevManager) SubscribeSwitchFaults() error { } // GetSwitchFaults will try to get all fault -func GetSwitchFaults() ([]int64, error) { +func GetSwitchFaults() ([]common.SwitchFaultEvent, error) { var errCount C.uint var errInfoArray [maxFaultNum]C.struct_LqDcmiEvent if retCode := C.lq_dcmi_get_fault_info(C.uint(maxFaultNum), &errCount, &errInfoArray[0]); int32(retCode) != devmanagercommon.Success { - return []int64{}, fmt.Errorf("failed to get switch device errorcodes, errCode:%v", retCode) + return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errorcodes, "+ + "errCode:%v", retCode) } if int32(errCount) < 0 || int32(errCount) > maxFaultNum { - return []int64{}, fmt.Errorf("failed to get switch device errcodes, cause errcodes nums %d is illegal", errCount) + return []common.SwitchFaultEvent{}, fmt.Errorf("failed to get switch device errcodes, "+ 
+ "cause errcodes nums %v is illegal", errCount) } - retErrores := make([]int64, 0) - for i := 0; i < len(errInfoArray); i++ { + errorCodes := make([]string, 0) + retErrorInfo := make([]common.SwitchFaultEvent, 0) + for i := 0; i < int(errCount); i++ { faultEvent := convertFaultEvent(&errInfoArray[i]) - if faultEvent.SubType == 0 { - continue - } if int8(faultEvent.Assertion) == devmanagercommon.FaultRecover { continue } - retErrores = append(retErrores, int64(faultEvent.SubType)) + errorCodes = append(errorCodes, faultEvent.AssembledFaultCode) + retErrorInfo = append(retErrorInfo, faultEvent) } - hwlog.RunLog.Warnf("get fault:%#v", retErrores) - return retErrores, nil + // DO NOT edit this log, if necessary DO inform fault dialog + hwlog.RunLog.Warnf("switch of 910A3 get fault codes: %#v", errorCodes) + return retErrorInfo, nil } // convertFaultEvent convert event getting from driver to go struct -func convertFaultEvent(event *C.struct_LqDcmiEvent) SwitchFaultEvent { - fault := SwitchFaultEvent{ +func convertFaultEvent(event *C.struct_LqDcmiEvent) common.SwitchFaultEvent { + fault := common.SwitchFaultEvent{ EventType: uint(event.eventType), SubType: uint(event.subType), PeerPortDevice: uint(event.peerportDevice), @@ -285,13 +271,86 @@ func convertFaultEvent(event *C.struct_LqDcmiEvent) SwitchFaultEvent { NotifySerialNum: int(event.notifySerialNum), AlarmRaisedTime: int64(event.alarmRaisedTime), } - fault.SubType = getEventType(fault) + if err := setExtraFaultInfo(&fault); err != nil { + hwlog.RunLog.Error(err) + } return fault } -func getEventType(event SwitchFaultEvent) uint { - if duplicate, ok := duplicateSubEventMap[int(event.SubType)]; ok && duplicate { - return event.SubType*subTypeBase + event.PeerPortDevice +// setExtraFaultInfo to convert fault event struct to a standard fault code as [0x00f1ff09,155912,cpu,na] +func setExtraFaultInfo(event *common.SwitchFaultEvent) error { + faultID, ok, alarmID := uint(0), false, "" + // while eventType is 5, it 
// isPortLevelFault reports whether eventType belongs to the event types treated as port level
// faults (3, 4, 5, 14 and 15); every other event type is treated as a whole chip fault.
// For a whole chip fault the peer device type is "na" while peer device == 0,
// for a port level fault the peer device type is "cpu" while peer device == 0.
func isPortLevelFault(eventType int) bool {
	switch eventType {
	case 3, 4, 5, 14, 15:
		return true
	default:
		return false
	}
}
recover event for faultEvent +func isFaultRecoveredEvent(faultEvent, recoverEvent common.SwitchFaultEvent) bool { + if int8(recoverEvent.Assertion) != devmanagercommon.FaultRecover || recoverEvent.Assertion == faultEvent.Assertion { + return false } - return event.SubType + faultEventInfo := fmt.Sprintf("EventType:%v,SubType:%v,FaultID:%v,AssembledFaultCode:%v,"+ + "PeerPortDevice:%v,PeerPortId:%v,SwitchChipId:%v,SwitchPortId:%v", faultEvent.EventType, faultEvent.SubType, + faultEvent.FaultID, faultEvent.AssembledFaultCode, faultEvent.PeerPortDevice, faultEvent.PeerPortId, + faultEvent.SwitchChipId, faultEvent.SwitchPortId) + recoveredEventInfo := fmt.Sprintf("EventType:%v,SubType:%v,FaultID:%v,AssembledFaultCode:%v,"+ + "PeerPortDevice:%v,PeerPortId:%v,SwitchChipId:%v,SwitchPortId:%v", recoverEvent.EventType, recoverEvent.SubType, + recoverEvent.FaultID, recoverEvent.AssembledFaultCode, recoverEvent.PeerPortDevice, recoverEvent.PeerPortId, + recoverEvent.SwitchChipId, recoverEvent.SwitchPortId) + return faultEventInfo == recoveredEventInfo }