From 20f78ced01642a61b22ffa26628edbbdabbe9d09 Mon Sep 17 00:00:00 2001 From: "maofeng.huang" Date: Mon, 24 Nov 2025 13:55:19 +0800 Subject: [PATCH] Release v4.4.0 --- CHANGELOG.md | 10 ++++ README.md | 10 ++++ pkg/ixdcgm/api.go | 5 ++ pkg/ixdcgm/device_status.go | 4 ++ pkg/ixdcgm/hostengine_status.go | 57 +++++++++++++++++++++++ pkg/ixdcgm/utils.go | 10 ++++ samples/devicestatus/main.go | 1 + samples/dmon/main.go | 76 +++++++++++++++++++++++++++++++ samples/hostengine_status/main.go | 24 ++++++++++ 9 files changed, 197 insertions(+) create mode 100644 CHANGELOG.md create mode 100644 pkg/ixdcgm/hostengine_status.go create mode 100644 samples/dmon/main.go create mode 100644 samples/hostengine_status/main.go diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..8dc7656 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# Go-IXDCGM Changelog + +## v4.4.0 + +- Add dmon example +- Add hostengine status checking + +## v4.3.0 + +- First release diff --git a/README.md b/README.md index 7c84334..3d1965e 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,16 @@ GPU: 0 ``` +To get hostengine status, run the following command: +``` +$ go run samples/hostengine_status/main.go + +# sample output + +Memory : 12072 KB +CPU : 0.00 % +``` + ## License Copyright (c) 2024 Iluvatar CoreX. All rights reserved. This project has an Apache-2.0 license, as diff --git a/pkg/ixdcgm/api.go b/pkg/ixdcgm/api.go index fc749c2..3170bfa 100644 --- a/pkg/ixdcgm/api.go +++ b/pkg/ixdcgm/api.go @@ -116,3 +116,8 @@ func ListenForPolicyViolationsForAllGPUs(ctx context.Context, params *PolicyCond func ListenForPolicyViolationsForGPUs(ctx context.Context, params *PolicyConditionParams, gpuIds ...uint) (<-chan PolicyViolation, error) { return registerPolicyForGpus(ctx, params, gpuIds...) } + +// Introspect returns IxDCGM hostengine memory and CPU usage +func Introspect() (IxDcgmStatus, error) { + return introspect() +} diff --git a/pkg/ixdcgm/device_status.go b/pkg/ixdcgm/device_status.go index f0aac21..8fccba4 100644 --- a/pkg/ixdcgm/device_status.go +++ b/pkg/ixdcgm/device_status.go @@ -79,6 +79,7 @@ type DeviceStatus struct { FanSpeed string // "N/A" or int64 str, % EccSbeVolDev string // "N/A" or int64 str, 1 for errors occurred, 0 for no errors EccDbeVolDev string // "N/A" or int64 str, 1 for errors occurred, 0 for no errors + XidErrors int64 // 0 for no errors } type DeviceProfStatus struct { @@ -104,6 +105,7 @@ func getDeviceStatus(gpuId uint) (status DeviceStatus, err error) { IdxMemTotal IdxMemUsed IdxMemFree + IdxXidErrors ) fields := []Short{ @@ -122,6 +124,7 @@ func getDeviceStatus(gpuId uint) (status DeviceStatus, err error) { DCGM_FI_DEV_FB_TOTAL, DCGM_FI_DEV_FB_USED, DCGM_FI_DEV_FB_FREE, + DCGM_FI_DEV_XID_ERRORS, } fieldGrpName := fmt.Sprintf("devStatusFields%d", rand.Uint64()) @@ -177,6 +180,7 @@ func getDeviceStatus(gpuId uint) (status DeviceStatus, err error) { FanSpeed: GetFieldValueStr(values[IdxFanSpeed], "int64"), EccSbeVolDev: GetFieldValueStr(values[IdxEccSbeVolDev], "int64"), EccDbeVolDev: GetFieldValueStr(values[IdxEccDbeVolDev], "int64"), + XidErrors: values[IdxXidErrors].Int64(), } _ = FieldGroupDestroy(fieldGrp) diff --git a/pkg/ixdcgm/hostengine_status.go b/pkg/ixdcgm/hostengine_status.go new file mode 100644 index 0000000..3b88588 --- /dev/null +++ b/pkg/ixdcgm/hostengine_status.go @@ -0,0 +1,57 @@ +/* +Copyright (c) 2024, NVIDIA CORPORATION. +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ixdcgm + +/* +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" +*/ +import "C" +import "unsafe" + +type IxDcgmStatus struct { + Memory int64 // KB + CPU float64 +} + +func introspect() (engine IxDcgmStatus, err error) { + var memory C.dcgmIntrospectMemory_t + memory.version = makeVersion1(unsafe.Sizeof(memory)) + waitIfNoData := 1 + result := C.dcgmIntrospectGetHostengineMemoryUsage(handle.handle, &memory, C.int(waitIfNoData)) + + if err = errorString(result); err != nil { + return engine, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + var cpu C.dcgmIntrospectCpuUtil_t + + cpu.version = makeVersion1(unsafe.Sizeof(cpu)) + result = C.dcgmIntrospectGetHostengineCpuUtilization(handle.handle, &cpu, C.int(waitIfNoData)) + + if err = errorString(result); err != nil { + return engine, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + engine = IxDcgmStatus{ + Memory: toInt64(memory.bytesUsed) / 1024, + CPU: *dblToFloat(cpu.total) * 100, + } + return +} diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go index 5f54ec2..7681ea0 100644 --- a/pkg/ixdcgm/utils.go +++ b/pkg/ixdcgm/utils.go @@ -94,6 +94,16 @@ func ixdcgmErrorString(result C.ixdcgmReturn_t) error { return fmt.Errorf("%v", err) } +func toInt64(c C.longlong) int64 { + i := int64(c) + return i +} + +func dblToFloat(val C.double) *float64 { + i := float64(val) + return &i +} + func string2Char(c string) *C.char { return C.CString(c) } diff --git a/samples/devicestatus/main.go b/samples/devicestatus/main.go index fee6385..67a9105 100644 --- a/samples/devicestatus/main.go +++ b/samples/devicestatus/main.go @@ -66,6 +66,7 @@ func main() { fmt.Printf("Total Memory (MB) : %d\n", st.MemUsage.Total) fmt.Printf("Used Memory (MB) : %d\n", st.MemUsage.Used) fmt.Printf("Free Memory (MB) : %d\n", st.MemUsage.Free) + fmt.Printf("Xid Errors : %d\n", st.XidErrors) fmt.Printf("SmActive : %s\n", pst.SmActive) fmt.Printf("SmOccupancy : %s\n", pst.SmOccupancy) fmt.Printf("DramActive : %s\n", pst.DramActive) diff --git a/samples/dmon/main.go b/samples/dmon/main.go new file mode 100644 index 0000000..8cf1727 --- /dev/null +++ b/samples/dmon/main.go @@ -0,0 +1,76 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "fmt" + "log" + "os" + "os/signal" + "syscall" + "time" + + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" +) + +const ( + header = `# gpu pwr temp sm mem mclk pclk +# Idx W C % % MHz MHz` +) + +// modelled on ixsmi dmon +// ixdcgmi dmon -e 155,150,203,204,100,101 +func main() { + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) + if err != nil { + log.Panicln(err) + } + defer cleanup() + + gpus, err := ixdcgm.GetSupportedDevices() + if err != nil { + log.Panicln(err) + } + + ticker := time.NewTicker(time.Second * 1) + defer ticker.Stop() + + fmt.Println(header) + for { + select { + case <-ticker.C: + for _, gpu := range gpus { + st, err := ixdcgm.GetDeviceStatus(gpu) + if err != nil { + log.Panicln(err) + } + + fmt.Printf("%5d %.5s %5s %5d %5d %5d %5d\n", + gpu, st.Power, st.Temperature, st.Utilization.Gpu, st.Utilization.Mem, + st.Clocks.Mem, st.Clocks.Sm) + } + + case <-sigs: + return + } + } +} diff --git a/samples/hostengine_status/main.go b/samples/hostengine_status/main.go new file mode 100644 index 0000000..c37d54e --- /dev/null +++ b/samples/hostengine_status/main.go @@ -0,0 +1,24 @@ +package main + +import ( + "fmt" + "log" + + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" +) + +// ixdcgmi introspect -s -H +func main() { + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) + if err != nil { + log.Panicln(err) + } + defer cleanup() + + st, err := ixdcgm.Introspect() + if err != nil { + log.Panicln(err) + } + + fmt.Printf("Memory %2s %v KB\nCPU %5s %.2f %s\n", ":", st.Memory, ":", st.CPU, "%") +} -- Gitee