1
0
Fork 0
telegraf/plugins/inputs/amd_rocm_smi/amd_rocm_smi.go

306 lines
10 KiB
Go
Raw Normal View History

//go:generate ../../../tools/readme_config_includer/generator
package amd_rocm_smi
import (
_ "embed"
"encoding/json"
"fmt"
"os"
"os/exec"
"strconv"
"strings"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
// sampleConfig receives the contents of sample.conf through the go:embed
// directive below; it is the string returned by (*ROCmSMI).SampleConfig.
//
//go:embed sample.conf
var sampleConfig string
// measurement is the measurement name passed to the accumulator for every
// metric emitted by this plugin.
const measurement = "amd_rocm_smi"
// ROCmSMI is the input plugin that collects AMD GPU metrics by running the
// rocm-smi binary and parsing its JSON output.
type ROCmSMI struct {
	// BinPath is the path of the rocm-smi executable. Start replaces it with
	// the result of a PATH lookup when the configured file does not exist.
	BinPath string `toml:"bin_path"`
	// Timeout bounds a single invocation of the rocm-smi command.
	Timeout config.Duration `toml:"timeout"`
	// Log is excluded from the TOML configuration.
	// NOTE(review): presumably injected by the Telegraf framework - confirm.
	Log telegraf.Logger `toml:"-"`
}
// gpu mirrors the per-card JSON object found in the output of the rocm-smi
// command. Each field is bound to the JSON key named in its tag and every
// value is decoded as a plain string; numeric conversion happens later when
// the metric fields are built. Keys that are absent from the output leave the
// corresponding field as the empty string.
type gpu struct {
	DeviceID                     string `json:"Device ID"`
	GpuID                        string `json:"GPU ID"`
	GpuUniqueID                  string `json:"Unique ID"`
	GpuVBIOSVersion              string `json:"VBIOS version"`
	GpuTemperatureSensorEdge     string `json:"Temperature (Sensor edge) (C)"`
	GpuTemperatureSensorJunction string `json:"Temperature (Sensor junction) (C)"`
	GpuTemperatureSensorMemory   string `json:"Temperature (Sensor memory) (C)"`
	GpuDcefClkClockSpeed         string `json:"dcefclk clock speed:"`
	GpuDcefClkClockLevel         string `json:"dcefclk clock level:"`
	GpuFclkClockSpeed            string `json:"fclk clock speed:"`
	GpuFclkClockLevel            string `json:"fclk clock level:"`
	GpuMclkClockSpeed            string `json:"mclk clock speed:"`
	GpuMclkClockLevel            string `json:"mclk clock level:"`
	GpuSclkClockSpeed            string `json:"sclk clock speed:"`
	GpuSclkClockLevel            string `json:"sclk clock level:"`
	GpuSocclkClockSpeed          string `json:"socclk clock speed:"`
	GpuSocclkClockLevel          string `json:"socclk clock level:"`
	GpuPcieClock                 string `json:"pcie clock level"`
	GpuFanSpeedLevel             string `json:"Fan speed (level)"`
	GpuFanSpeedPercentage        string `json:"Fan speed (%)"`
	GpuFanRPM                    string `json:"Fan RPM"`
	GpuPerformanceLevel          string `json:"Performance Level"`
	GpuOverdrive                 string `json:"GPU OverDrive value (%)"`
	GpuMaxPower                  string `json:"Max Graphics Package Power (W)"`
	GpuAveragePower              string `json:"Average Graphics Package Power (W)"`
	GpuUsePercentage             string `json:"GPU use (%)"`
	GpuMemoryAllocatedPercentage string `json:"GPU Memory Allocated (VRAM%)"`
	GpuMemoryUsePercentage       string `json:"GPU memory use (%)"`
	GpuMemoryVendor              string `json:"GPU memory vendor"`
	GpuPCIeReplay                string `json:"PCIe Replay Count"`
	GpuSerialNumber              string `json:"Serial Number"`
	GpuVoltagemV                 string `json:"Voltage (mV)"`
	GpuPCIBus                    string `json:"PCI Bus"`
	GpuASDFirmware               string `json:"ASD firmware version"`
	GpuCEFirmware                string `json:"CE firmware version"`
	GpuDMCUFirmware              string `json:"DMCU firmware version"`
	GpuMCFirmware                string `json:"MC firmware version"`
	GpuMEFirmware                string `json:"ME firmware version"`
	GpuMECFirmware               string `json:"MEC firmware version"`
	GpuMEC2Firmware              string `json:"MEC2 firmware version"`
	GpuPFPFirmware               string `json:"PFP firmware version"`
	GpuRLCFirmware               string `json:"RLC firmware version"`
	GpuRLCSRLC                   string `json:"RLC SRLC firmware version"`
	GpuRLCSRLG                   string `json:"RLC SRLG firmware version"`
	GpuRLCSRLS                   string `json:"RLC SRLS firmware version"`
	GpuSDMAFirmware              string `json:"SDMA firmware version"`
	GpuSDMA2Firmware             string `json:"SDMA2 firmware version"`
	GpuSMCFirmware               string `json:"SMC firmware version"`
	GpuSOSFirmware               string `json:"SOS firmware version"`
	GpuTARAS                     string `json:"TA RAS firmware version"`
	GpuTAXGMI                    string `json:"TA XGMI firmware version"`
	GpuUVDFirmware               string `json:"UVD firmware version"`
	GpuVCEFirmware               string `json:"VCE firmware version"`
	GpuVCNFirmware               string `json:"VCN firmware version"`
	GpuCardSeries                string `json:"Card series"`
	GpuCardModel                 string `json:"Card model"`
	GpuCardVendor                string `json:"Card vendor"`
	GpuCardSKU                   string `json:"Card SKU"`
	GpuNUMANode                  string `json:"(Topology) Numa Node"`
	GpuNUMAAffinity              string `json:"(Topology) Numa Affinity"`
	GpuVisVRAMTotalMemory        string `json:"VIS_VRAM Total Memory (B)"`
	GpuVisVRAMTotalUsedMemory    string `json:"VIS_VRAM Total Used Memory (B)"`
	GpuVRAMTotalMemory           string `json:"VRAM Total Memory (B)"`
	GpuVRAMTotalUsedMemory       string `json:"VRAM Total Used Memory (B)"`
	GpuGTTTotalMemory            string `json:"GTT Total Memory (B)"`
	GpuGTTTotalUsedMemory        string `json:"GTT Total Used Memory (B)"`
}
// sysInfo mirrors a JSON object of the rocm-smi output that carries
// system-wide information rather than per-card values.
type sysInfo struct {
	// DriverVersion is decoded from the "Driver version" key.
	DriverVersion string `json:"Driver version"`
}
// metric pairs the tag set and the field set of one data point before it is
// handed to the accumulator.
type metric struct {
	// tags holds the string key/value pairs identifying the data point.
	tags map[string]string
	// fields holds the typed values of the data point.
	fields map[string]interface{}
}
// SampleConfig returns the example configuration embedded from sample.conf.
func (*ROCmSMI) SampleConfig() string {
	return sampleConfig
}
// Start verifies that a rocm-smi binary is available. When the configured
// BinPath does not exist, the executable is searched for in PATH and BinPath
// is updated to the location found. A failed lookup is reported as an
// internal.StartupError wrapping the lookup error. The accumulator argument is
// not used.
func (rsmi *ROCmSMI) Start(telegraf.Accumulator) error {
	_, statErr := os.Stat(rsmi.BinPath)
	if !os.IsNotExist(statErr) {
		// Either the file is there or Stat failed for a reason other than
		// "not exist"; in both cases the configured path is kept untouched.
		return nil
	}

	resolved, lookErr := exec.LookPath("rocm-smi")
	if lookErr != nil {
		return &internal.StartupError{Err: lookErr}
	}
	rsmi.BinPath = resolved
	return nil
}
// Gather runs rocm-smi once and adds the metrics parsed from its output to
// acc. It returns a wrapped error when the command fails, or the parse error
// when the output cannot be decoded.
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
	output, pollErr := rsmi.pollROCmSMI()
	if pollErr != nil {
		return fmt.Errorf("failed to execute command in pollROCmSMI: %w", pollErr)
	}

	return gatherROCmSMI(output, acc)
}
// Stop is a no-op; the plugin holds nothing that needs releasing.
func (*ROCmSMI) Stop() {
}
// pollROCmSMI executes the rocm-smi binary at BinPath, bounded by Timeout, and
// returns whatever internal.StdOutputTimeout yields for the command.
func (rsmi *ROCmSMI) pollROCmSMI() ([]byte, error) {
	// rocm-smi offers an "-a" option, but (as of ROCm v4.3.x) it does not
	// report everything needed here, so every wanted item is requested
	// explicitly.
	args := []string{
		"-o",
		"-l",
		"-m",
		"-M",
		"-g",
		"-c",
		"-t",
		"-u",
		"-i",
		"-f",
		"-p",
		"-P",
		"-s",
		"-S",
		"-v",
		"--showreplaycount",
		"--showpids",
		"--showdriverversion",
		"--showmemvendor",
		"--showfwinfo",
		"--showproductname",
		"--showserial",
		"--showuniqueid",
		"--showbus",
		"--showpendingpages",
		"--showpagesinfo",
		"--showmeminfo",
		"all", // positional value following --showmeminfo
		"--showretiredpages",
		"--showunreservablepages",
		"--showmemuse",
		"--showvoltage",
		"--showtopo",
		"--showtopoweight",
		"--showtopohops",
		"--showtopotype",
		"--showtoponuma",
		"--json", // the output is decoded with encoding/json by the caller
	}

	command := exec.Command(rsmi.BinPath, args...)
	return internal.StdOutputTimeout(command, time.Duration(rsmi.Timeout))
}
// genTagsFields converts the decoded rocm-smi output into one metric per GPU.
//
// Only entries of gpus whose key contains "card" are turned into metrics; any
// other key is skipped. The driver version is read from system["system"] and
// attached to every card. Values that are empty or cannot be parsed into the
// requested type are left out of the resulting field set. Because gpus is a
// map, the order of the returned metrics is not deterministic.
func genTagsFields(gpus map[string]gpu, system map[string]sysInfo) []metric {
	metrics := make([]metric, 0, len(gpus))

	// Identical for every card, so compute it once. The dots are removed so
	// the version can be stored as an integer field.
	driverVersion := strings.ReplaceAll(system["system"].DriverVersion, ".", "")

	for cardID, payload := range gpus {
		if !strings.Contains(cardID, "card") {
			continue
		}

		tags := map[string]string{
			"name": cardID,
		}

		//nolint:errcheck // silently treat as zero if malformed
		totVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalMemory, 10, 64)
		//nolint:errcheck // silently treat as zero if malformed
		usdVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalUsedMemory, 10, 64)
		strFree := strconv.FormatInt(totVRAM-usdVRAM, 10)

		// Prefer the value found in Device ID and fall back to GPU ID for
		// backwards compatibility. The fallback is written first so that a
		// non-empty Device ID overrides it.
		setTagIfUsed(tags, "gpu_id", payload.GpuID)
		setTagIfUsed(tags, "gpu_id", payload.DeviceID)
		setTagIfUsed(tags, "gpu_unique_id", payload.GpuUniqueID)

		fields := make(map[string]interface{}, 20)
		setIfUsed("int", fields, "driver_version", driverVersion)
		setIfUsed("int", fields, "fan_speed", payload.GpuFanSpeedPercentage)
		setIfUsed("int64", fields, "memory_total", payload.GpuVRAMTotalMemory)
		setIfUsed("int64", fields, "memory_used", payload.GpuVRAMTotalUsedMemory)
		setIfUsed("int64", fields, "memory_free", strFree)
		setIfUsed("float", fields, "temperature_sensor_edge", payload.GpuTemperatureSensorEdge)
		setIfUsed("float", fields, "temperature_sensor_junction", payload.GpuTemperatureSensorJunction)
		setIfUsed("float", fields, "temperature_sensor_memory", payload.GpuTemperatureSensorMemory)
		setIfUsed("int", fields, "utilization_gpu", payload.GpuUsePercentage)

		// Prefer the allocated-VRAM percentage and fall back to the older
		// memory-use percentage. As with gpu_id, the fallback is written
		// first so that a usable preferred value overrides it.
		setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryUsePercentage)
		setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryAllocatedPercentage)

		// strings.Trim takes a cutset: any leading or trailing '(', 'M', 'h',
		// 'z' or ')' is stripped, leaving the bare number.
		setIfUsed("int", fields, "clocks_current_sm", strings.Trim(payload.GpuSclkClockSpeed, "(Mhz)"))
		setIfUsed("int", fields, "clocks_current_memory", strings.Trim(payload.GpuMclkClockSpeed, "(Mhz)"))
		setIfUsed("int", fields, "clocks_current_display", strings.Trim(payload.GpuDcefClkClockSpeed, "(Mhz)"))
		setIfUsed("int", fields, "clocks_current_fabric", strings.Trim(payload.GpuFclkClockSpeed, "(Mhz)"))
		setIfUsed("int", fields, "clocks_current_system", strings.Trim(payload.GpuSocclkClockSpeed, "(Mhz)"))
		setIfUsed("float", fields, "power_draw", payload.GpuAveragePower)
		setIfUsed("str", fields, "card_series", payload.GpuCardSeries)
		setIfUsed("str", fields, "card_model", payload.GpuCardModel)
		setIfUsed("str", fields, "card_vendor", payload.GpuCardVendor)

		metrics = append(metrics, metric{tags, fields})
	}

	return metrics
}
// gatherROCmSMI decodes the JSON output of rocm-smi and adds one data point
// per generated metric to acc under the plugin's measurement name.
//
// The same payload is decoded twice, once keyed to per-card data and once
// keyed to system information. The first decoding error, if any, is returned
// unchanged and nothing is added to acc.
func gatherROCmSMI(ret []byte, acc telegraf.Accumulator) error {
	var (
		cards  map[string]gpu
		system map[string]sysInfo
	)

	if err := json.Unmarshal(ret, &cards); err != nil {
		return err
	}
	if err := json.Unmarshal(ret, &system); err != nil {
		return err
	}

	for _, point := range genTagsFields(cards, system) {
		acc.AddFields(measurement, point.fields, point.tags)
	}

	return nil
}
// setTagIfUsed stores v under key k in m unless v is the empty string, in
// which case m is left unchanged. An existing entry for k is overwritten.
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
// setIfUsed converts the first whitespace-separated token of v to the type
// named by t and stores the result in m under key k.
//
// Supported values of t are "float" (float64), "int" (int), "int64" (int64)
// and "str" (string). Nothing is stored when v holds no token, when t is not
// one of the supported names, or when the token cannot be parsed as the
// requested numeric type. Only the first token is considered, so anything
// after the first whitespace in v is dropped, for "str" as well.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	tokens := strings.Fields(v)
	if len(tokens) == 0 {
		return
	}
	// strings.Fields never yields empty tokens, so first is non-empty here.
	first := tokens[0]

	var (
		converted interface{}
		err       error
	)
	switch t {
	case "float":
		converted, err = strconv.ParseFloat(first, 64)
	case "int":
		converted, err = strconv.Atoi(first)
	case "int64":
		converted, err = strconv.ParseInt(first, 10, 64)
	case "str":
		converted = first
	default:
		// Unknown type name: leave the map untouched.
		return
	}

	if err == nil {
		m[k] = converted
	}
}
func init() {
inputs.Add("amd_rocm_smi", func() telegraf.Input {
return &ROCmSMI{
BinPath: "/opt/rocm/bin/rocm-smi",
Timeout: config.Duration(5 * time.Second),
}
})
}