diff --git a/device-plugin b/device-plugin new file mode 100644 index 0000000000000000000000000000000000000000..9b06f89a134c926de01818455a7072c14a2dd9d9 Binary files /dev/null and b/device-plugin differ diff --git a/pkg/server/manager.go b/pkg/server/manager.go index c9ffabcd2d64f59ae78c08221eb46c2cf443b861..e7ab82b2b033ef6187fc66b1970c117d9709e491 100644 --- a/pkg/server/manager.go +++ b/pkg/server/manager.go @@ -16,9 +16,12 @@ package server import ( + "bufio" "context" "encoding/json" "fmt" + "io" + "k8s.io/client-go/kubernetes" "os" "path/filepath" "strconv" @@ -300,6 +303,7 @@ func (hdm *HwDevManager) ListenDevice(ctx context.Context) { // cache to prevent manually separate npu IDs in cache from been lost hdm.separateNPUIDFromDeviceInfoIntoCache() go hdm.pollFaultCodeCM(ctx) + go hdm.getErrCodeFromLog() go hdm.Serve(ctx) initTime := time.Now() for { @@ -824,6 +828,85 @@ func (hdm *HwDevManager) isSupportGraceTolerance() bool { return true } +func readLog(reader *bufio.Reader) { + for { + // 读取一行日志 + line, err := reader.ReadString('\n') + if err == io.EOF { + // 如果到达文件末尾,等待一段时间后重试 + time.Sleep(1 * time.Second) + continue + } else if err != nil { + hwlog.RunLog.Errorf("read err %v", err) + } + hwlog.RunLog.Warnf("get: %v", line) + // 检查是否包含关键字"Err" + if strings.Contains(line, "Err") { + // 输出包含关键字"Err"的日志行 + hwlog.RunLog.Infof("got target: %v", line) + } + } +} + +func readLog2(clientset kubernetes.Interface, namespace, podName string, recordMap map[string]string) { + tailLine := int64(2) + // 获取 Pod 的日志 + req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &v1.PodLogOptions{ + Follow: true, + TailLines: &tailLine, + }) + readCloser, err := req.Stream(context.Background()) + if err != nil { + panic(err) + } + defer readCloser.Close() + + // 读取 Pod 的日志并输出 + buf := make([]byte, 2000) + for { + n, err := readCloser.Read(buf) + if err != nil && err != io.EOF { + hwlog.RunLog.Errorf("read panic err: %v", err) + } + if n > 0 { + hwlog.RunLog.Infof("get log: %s", string(buf[:n])) + } + if err == io.EOF { + hwlog.RunLog.Infof("log finished") + delete(recordMap, namespace+podName) + break + } + time.Sleep(1 * time.Second) + } +} + +func (hdm *HwDevManager) getErrCodeFromLog() { + recordMap := make(map[string]string) + + for { + podList := hdm.manager.GetKubeClient().GetAllPodListCache() + for _, pod := range podList { + for k, _ := range pod.Annotations { + if _, ok := recordMap[pod.Namespace+pod.Name]; ok { + continue + } + + if strings.Contains(k, "huawei") { + if pod.Status.Phase == v1.PodRunning || pod.Status.Phase == v1.PodPending || pod.Status.Phase == v1.PodSucceeded { + continue + } + recordMap[pod.Namespace+pod.Name] = "1" + hwlog.RunLog.Warnf("get pod %v, status is: %v", pod.Name, pod.Status.Phase) + go readLog2(hdm.manager.GetKubeClient().Clientset, pod.Namespace, pod.Name, recordMap) + } + } + + } + + time.Sleep(5 * time.Second) + } +} + func (hdm *HwDevManager) pollFaultCodeCM(ctx context.Context) { var resourceVersion = "" var interval = common.PollFaultCodeCMInterval