Add an NPU "resetting" status and report resetting devices as recovering.

- pkg/common: new NPUNormalStatus/NPUResettingStatus constants, a Status field
  on NpuDevice and a RecoveringDevices set on DevStatusSet.
- pkg/device: setAllDevUnhealthyOnRing marks every device on the ring as
  resetting; groupDevsByStatus collects resetting devices and
  getDevStatesDevSet merges them into the returned DevStatusSet.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy, so the
indentation and column alignment inside lines follow gofmt conventions and may
not match the target files byte-for-byte; apply with a whitespace-tolerant
option (e.g. `git apply --ignore-whitespace`). The constants.go index line
still carries the post-image hash from before the doc comments and trailing
newline were added.

diff --git a/pkg/common/constants.go b/pkg/common/constants.go
index 40d5bce7d4f53e2e9f4b9c92e6d8c94b70d49300..5f0ed8ec653e0c0546c75acc20176e8f04706955 100644
--- a/pkg/common/constants.go
+++ b/pkg/common/constants.go
@@ -753,3 +753,10 @@ const (
 	// Subscribe represents subscribe mode
 	Subscribe = "subscribe"
 )
+
+const (
+	// NPUNormalStatus represents the normal NPU status
+	NPUNormalStatus = "normal"
+	// NPUResettingStatus represents the resetting NPU status; devices carrying it are collected into RecoveringDevices
+	NPUResettingStatus = "resetting"
+)
diff --git a/pkg/common/proto.go b/pkg/common/proto.go
index 3a9215d8d3adcf1f36836ac3d670d7c74d855658..8a27b86e67dc2ccc072a3e9730dcbe4a3248d509 100644
--- a/pkg/common/proto.go
+++ b/pkg/common/proto.go
@@ -80,6 +80,7 @@ type NpuDevice struct {
 	LogicID int32
 	PhyID   int32
 	CardID  int32
+	Status  string
 }
 
 // DavinCiDev davinci device
@@ -153,6 +154,7 @@ type DevStatusSet struct {
 	UnHealthyDevice    sets.String
 	NetUnHealthyDevice sets.String
 	HealthDevices      sets.String
+	RecoveringDevices  sets.String
 	FreeHealthyDevice  map[string]sets.String
 	DeviceFault        []DeviceFault
 }
diff --git a/pkg/device/ascend910.go b/pkg/device/ascend910.go
index 0f87f8a0634a06b02e2ff01129b7c5f2e51d01f4..68823b17701b83fc24b7b0bd6881ca970e1eb559 100644
--- a/pkg/device/ascend910.go
+++ b/pkg/device/ascend910.go
@@ -216,6 +216,7 @@ func (hnm *HwAscend910Manager) setAllDevUnhealthyOnRing(classifyDevs map[string]
 	for devIndex := startDevIndex; devIndex < endDevIndex; devIndex++ {
 		devStatusList[devIndex].NetworkHealth = v1beta1.Unhealthy
 		devStatusList[devIndex].Health = v1beta1.Unhealthy
+		devStatusList[devIndex].Status = common.NPUResettingStatus
 	}
 	return nil
 }
diff --git a/pkg/device/ascendcommon.go b/pkg/device/ascendcommon.go
index 2b1698e9354e79f39413442a9b67052697d33c4a..903272b6cda32e1734e7891c940e3c3fd52fe5c4 100644
--- a/pkg/device/ascendcommon.go
+++ b/pkg/device/ascendcommon.go
@@ -388,7 +388,8 @@ func (tool *AscendTools) getRealUsedDevices() sets.String {
 
 func (tool *AscendTools) getDevStatesDevSet(classifyDevs map[string][]*common.NpuDevice) common.DevStatusSet {
 	totalFreeDevices := make(map[string]sets.String, len(classifyDevs))
-	totalUHDevices, totalNetUHDevices, allTypeUsedDevice := sets.String{}, sets.String{}, sets.String{}
+	totalUHDevices, totalNetUHDevices, allTypeUsedDevice, totalRCDevices :=
+		sets.String{}, sets.String{}, sets.String{}, sets.String{}
 	totalDeviceFaults := make([]common.DeviceFault, 0, common.GeneralMapSize)
 	if !common.ParamOption.PresetVDevice {
 		allTypeUsedDevice = tool.getRealUsedDevices()
@@ -402,18 +403,21 @@ func (tool *AscendTools) getDevStatesDevSet(classifyDevs map[string][]*common.Np
 		}
 		totalUHDevices = totalUHDevices.Union(partDevStatusSet.UnHealthyDevice)
 		totalNetUHDevices = totalNetUHDevices.Union(partDevStatusSet.NetUnHealthyDevice)
+		totalRCDevices = totalRCDevices.Union(partDevStatusSet.RecoveringDevices)
 		totalDeviceFaults = append(totalDeviceFaults, partDevStatusSet.DeviceFault...)
 	}
 	return common.DevStatusSet{
 		FreeHealthyDevice:  totalFreeDevices,
 		UnHealthyDevice:    totalUHDevices,
 		NetUnHealthyDevice: totalNetUHDevices,
+		RecoveringDevices:  totalRCDevices,
 		DeviceFault:        totalDeviceFaults,
 	}
 }
 
 func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice, runMode string) common.DevStatusSet {
-	healthDevice, totalUHDevices, totalNetworkUHDevices := sets.String{}, sets.String{}, sets.String{}
+	healthDevice, totalUHDevices, totalNetworkUHDevices, totalRCDevices :=
+		sets.String{}, sets.String{}, sets.String{}, sets.String{}
 	deviceFaults := make([]common.DeviceFault, 0, common.GeneralMapSize)
 	for _, device := range subClassDevices {
 		if len(device.NetworkFaultCodes) != 0 || device.NetworkHealth == v1beta1.Unhealthy {
@@ -439,6 +443,9 @@ func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice,
 		if device.NetworkHealth == v1beta1.Unhealthy {
 			totalNetworkUHDevices.Insert(device.DeviceName)
 		}
+		if device.Status == common.NPUResettingStatus {
+			totalRCDevices.Insert(device.DeviceName)
+		}
 		if device.Health == v1beta1.Healthy {
 			healthDevice.Insert(device.DeviceName)
 			continue
@@ -458,6 +465,7 @@ func (tool *AscendTools) groupDevsByStatus(subClassDevices []*common.NpuDevice,
 		HealthDevices:      healthDevice,
 		UnHealthyDevice:    totalUHDevices,
 		NetUnHealthyDevice: totalNetworkUHDevices,
+		RecoveringDevices:  totalRCDevices,
 		DeviceFault:        deviceFaults,
 	}
 }