305 lines
10 KiB
Go
305 lines
10 KiB
Go
//go:generate ../../../tools/readme_config_includer/generator
|
|
package amd_rocm_smi
|
|
|
|
import (
|
|
_ "embed"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/influxdata/telegraf"
|
|
"github.com/influxdata/telegraf/config"
|
|
"github.com/influxdata/telegraf/internal"
|
|
"github.com/influxdata/telegraf/plugins/inputs"
|
|
)
|
|
|
|
//go:embed sample.conf
|
|
var sampleConfig string
|
|
|
|
// measurement is the metric name under which every GPU sample is reported.
const measurement = "amd_rocm_smi"
|
|
|
|
type ROCmSMI struct {
|
|
BinPath string `toml:"bin_path"`
|
|
Timeout config.Duration `toml:"timeout"`
|
|
Log telegraf.Logger `toml:"-"`
|
|
}
|
|
|
|
// gpu mirrors the per-card object of the rocm-smi JSON output. Every value is
// decoded as a string exactly as printed by the tool; conversion to numeric
// types happens later when the metric fields are built. Keys that are absent
// from the output leave the corresponding field empty.
type gpu struct {
	// Identification.
	DeviceID        string `json:"Device ID"`
	GpuID           string `json:"GPU ID"`
	GpuUniqueID     string `json:"Unique ID"`
	GpuVBIOSVersion string `json:"VBIOS version"`

	// Temperature sensors.
	GpuTemperatureSensorEdge     string `json:"Temperature (Sensor edge) (C)"`
	GpuTemperatureSensorJunction string `json:"Temperature (Sensor junction) (C)"`
	GpuTemperatureSensorMemory   string `json:"Temperature (Sensor memory) (C)"`

	// Clock speeds and levels. Note that most of these JSON keys end with a
	// colon.
	GpuDcefClkClockSpeed string `json:"dcefclk clock speed:"`
	GpuDcefClkClockLevel string `json:"dcefclk clock level:"`
	GpuFclkClockSpeed    string `json:"fclk clock speed:"`
	GpuFclkClockLevel    string `json:"fclk clock level:"`
	GpuMclkClockSpeed    string `json:"mclk clock speed:"`
	GpuMclkClockLevel    string `json:"mclk clock level:"`
	GpuSclkClockSpeed    string `json:"sclk clock speed:"`
	GpuSclkClockLevel    string `json:"sclk clock level:"`
	GpuSocclkClockSpeed  string `json:"socclk clock speed:"`
	GpuSocclkClockLevel  string `json:"socclk clock level:"`
	GpuPcieClock         string `json:"pcie clock level"`

	// Fan, performance and power.
	GpuFanSpeedLevel      string `json:"Fan speed (level)"`
	GpuFanSpeedPercentage string `json:"Fan speed (%)"`
	GpuFanRPM             string `json:"Fan RPM"`
	GpuPerformanceLevel   string `json:"Performance Level"`
	GpuOverdrive          string `json:"GPU OverDrive value (%)"`
	GpuMaxPower           string `json:"Max Graphics Package Power (W)"`
	GpuAveragePower       string `json:"Average Graphics Package Power (W)"`

	// Utilization.
	GpuUsePercentage             string `json:"GPU use (%)"`
	GpuMemoryAllocatedPercentage string `json:"GPU Memory Allocated (VRAM%)"`
	GpuMemoryUsePercentage       string `json:"GPU memory use (%)"`
	GpuMemoryVendor              string `json:"GPU memory vendor"`

	// Miscellaneous hardware information.
	GpuPCIeReplay   string `json:"PCIe Replay Count"`
	GpuSerialNumber string `json:"Serial Number"`
	GpuVoltagemV    string `json:"Voltage (mV)"`
	GpuPCIBus       string `json:"PCI Bus"`

	// Firmware versions. GpuASDDirmware carries a historical misspelling of
	// "Firmware"; the name is kept so existing references keep compiling.
	GpuASDDirmware   string `json:"ASD firmware version"`
	GpuCEFirmware    string `json:"CE firmware version"`
	GpuDMCUFirmware  string `json:"DMCU firmware version"`
	GpuMCFirmware    string `json:"MC firmware version"`
	GpuMEFirmware    string `json:"ME firmware version"`
	GpuMECFirmware   string `json:"MEC firmware version"`
	GpuMEC2Firmware  string `json:"MEC2 firmware version"`
	GpuPFPFirmware   string `json:"PFP firmware version"`
	GpuRLCFirmware   string `json:"RLC firmware version"`
	GpuRLCSRLC       string `json:"RLC SRLC firmware version"`
	GpuRLCSRLG       string `json:"RLC SRLG firmware version"`
	GpuRLCSRLS       string `json:"RLC SRLS firmware version"`
	GpuSDMAFirmware  string `json:"SDMA firmware version"`
	GpuSDMA2Firmware string `json:"SDMA2 firmware version"`
	GpuSMCFirmware   string `json:"SMC firmware version"`
	GpuSOSFirmware   string `json:"SOS firmware version"`
	GpuTARAS         string `json:"TA RAS firmware version"`
	GpuTAXGMI        string `json:"TA XGMI firmware version"`
	GpuUVDFirmware   string `json:"UVD firmware version"`
	GpuVCEFirmware   string `json:"VCE firmware version"`
	GpuVCNFirmware   string `json:"VCN firmware version"`

	// Product naming.
	GpuCardSeries string `json:"Card series"`
	GpuCardModel  string `json:"Card model"`
	GpuCardVendor string `json:"Card vendor"`
	GpuCardSKU    string `json:"Card SKU"`

	// NUMA topology.
	GpuNUMANode     string `json:"(Topology) Numa Node"`
	GpuNUMAAffinity string `json:"(Topology) Numa Affinity"`

	// Memory pool sizes and usage, in bytes according to the JSON keys.
	GpuVisVRAMTotalMemory     string `json:"VIS_VRAM Total Memory (B)"`
	GpuVisVRAMTotalUsedMemory string `json:"VIS_VRAM Total Used Memory (B)"`
	GpuVRAMTotalMemory        string `json:"VRAM Total Memory (B)"`
	GpuVRAMTotalUsedMemory    string `json:"VRAM Total Used Memory (B)"`
	GpuGTTTotalMemory         string `json:"GTT Total Memory (B)"`
	GpuGTTTotalUsedMemory     string `json:"GTT Total Used Memory (B)"`
}
|
|
|
|
// sysInfo mirrors the system-wide object of the rocm-smi JSON output; only
// the driver version is decoded.
type sysInfo struct {
	// DriverVersion is the raw driver version string as printed by rocm-smi.
	DriverVersion string `json:"Driver version"`
}
|
|
|
|
// metric is the intermediate representation of one GPU sample: the tag set
// identifying the card and the field values collected for it. The field order
// (tags first, then fields) is relied upon by positional struct literals.
type metric struct {
	// tags identify the card the sample belongs to.
	tags map[string]string
	// fields hold the parsed measurement values keyed by field name.
	fields map[string]interface{}
}
|
|
|
|
func (*ROCmSMI) SampleConfig() string {
|
|
return sampleConfig
|
|
}
|
|
|
|
func (rsmi *ROCmSMI) Start(telegraf.Accumulator) error {
|
|
if _, err := os.Stat(rsmi.BinPath); os.IsNotExist(err) {
|
|
binPath, err := exec.LookPath("rocm-smi")
|
|
if err != nil {
|
|
return &internal.StartupError{Err: err}
|
|
}
|
|
rsmi.BinPath = binPath
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (rsmi *ROCmSMI) Gather(acc telegraf.Accumulator) error {
|
|
data, err := rsmi.pollROCmSMI()
|
|
if err != nil {
|
|
return fmt.Errorf("failed to execute command in pollROCmSMI: %w", err)
|
|
}
|
|
|
|
return gatherROCmSMI(data, acc)
|
|
}
|
|
|
|
// Stop is a no-op; the plugin holds no resources between gather cycles.
func (*ROCmSMI) Stop() {}
|
|
|
|
func (rsmi *ROCmSMI) pollROCmSMI() ([]byte, error) {
|
|
// Construct and execute metrics query, there currently exist (ROCm v4.3.x) a "-a" option
|
|
// that does not provide all the information, so each needed parameter is set manually
|
|
cmd := exec.Command(rsmi.BinPath,
|
|
"-o",
|
|
"-l",
|
|
"-m",
|
|
"-M",
|
|
"-g",
|
|
"-c",
|
|
"-t",
|
|
"-u",
|
|
"-i",
|
|
"-f",
|
|
"-p",
|
|
"-P",
|
|
"-s",
|
|
"-S",
|
|
"-v",
|
|
"--showreplaycount",
|
|
"--showpids",
|
|
"--showdriverversion",
|
|
"--showmemvendor",
|
|
"--showfwinfo",
|
|
"--showproductname",
|
|
"--showserial",
|
|
"--showuniqueid",
|
|
"--showbus",
|
|
"--showpendingpages",
|
|
"--showpagesinfo",
|
|
"--showmeminfo",
|
|
"all",
|
|
"--showretiredpages",
|
|
"--showunreservablepages",
|
|
"--showmemuse",
|
|
"--showvoltage",
|
|
"--showtopo",
|
|
"--showtopoweight",
|
|
"--showtopohops",
|
|
"--showtopotype",
|
|
"--showtoponuma",
|
|
"--json")
|
|
|
|
return internal.StdOutputTimeout(cmd, time.Duration(rsmi.Timeout))
|
|
}
|
|
|
|
func genTagsFields(gpus map[string]gpu, system map[string]sysInfo) []metric {
|
|
metrics := make([]metric, 0, len(gpus))
|
|
for cardID := range gpus {
|
|
if strings.Contains(cardID, "card") {
|
|
tags := map[string]string{
|
|
"name": cardID,
|
|
}
|
|
|
|
payload := gpus[cardID]
|
|
//nolint:errcheck // silently treat as zero if malformed
|
|
totVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalMemory, 10, 64)
|
|
//nolint:errcheck // silently treat as zero if malformed
|
|
usdVRAM, _ := strconv.ParseInt(payload.GpuVRAMTotalUsedMemory, 10, 64)
|
|
strFree := strconv.FormatInt(totVRAM-usdVRAM, 10)
|
|
|
|
// Try using value found in Device ID first. If not found, try GPU
|
|
// ID for backwards compatibility.
|
|
setTagIfUsed(tags, "gpu_id", payload.DeviceID)
|
|
setTagIfUsed(tags, "gpu_id", payload.GpuID)
|
|
|
|
setTagIfUsed(tags, "gpu_unique_id", payload.GpuUniqueID)
|
|
|
|
fields := make(map[string]interface{}, 20)
|
|
setIfUsed("int", fields, "driver_version", strings.ReplaceAll(system["system"].DriverVersion, ".", ""))
|
|
setIfUsed("int", fields, "fan_speed", payload.GpuFanSpeedPercentage)
|
|
setIfUsed("int64", fields, "memory_total", payload.GpuVRAMTotalMemory)
|
|
setIfUsed("int64", fields, "memory_used", payload.GpuVRAMTotalUsedMemory)
|
|
setIfUsed("int64", fields, "memory_free", strFree)
|
|
setIfUsed("float", fields, "temperature_sensor_edge", payload.GpuTemperatureSensorEdge)
|
|
setIfUsed("float", fields, "temperature_sensor_junction", payload.GpuTemperatureSensorJunction)
|
|
setIfUsed("float", fields, "temperature_sensor_memory", payload.GpuTemperatureSensorMemory)
|
|
setIfUsed("int", fields, "utilization_gpu", payload.GpuUsePercentage)
|
|
// Try using allocated percentage first.
|
|
setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryAllocatedPercentage)
|
|
setIfUsed("int", fields, "utilization_memory", payload.GpuMemoryUsePercentage)
|
|
setIfUsed("int", fields, "clocks_current_sm", strings.Trim(payload.GpuSclkClockSpeed, "(Mhz)"))
|
|
setIfUsed("int", fields, "clocks_current_memory", strings.Trim(payload.GpuMclkClockSpeed, "(Mhz)"))
|
|
setIfUsed("int", fields, "clocks_current_display", strings.Trim(payload.GpuDcefClkClockSpeed, "(Mhz)"))
|
|
setIfUsed("int", fields, "clocks_current_fabric", strings.Trim(payload.GpuFclkClockSpeed, "(Mhz)"))
|
|
setIfUsed("int", fields, "clocks_current_system", strings.Trim(payload.GpuSocclkClockSpeed, "(Mhz)"))
|
|
setIfUsed("float", fields, "power_draw", payload.GpuAveragePower)
|
|
setIfUsed("str", fields, "card_series", payload.GpuCardSeries)
|
|
setIfUsed("str", fields, "card_model", payload.GpuCardModel)
|
|
setIfUsed("str", fields, "card_vendor", payload.GpuCardVendor)
|
|
|
|
metrics = append(metrics, metric{tags, fields})
|
|
}
|
|
}
|
|
return metrics
|
|
}
|
|
|
|
func gatherROCmSMI(ret []byte, acc telegraf.Accumulator) error {
|
|
var gpus map[string]gpu
|
|
var sys map[string]sysInfo
|
|
|
|
err1 := json.Unmarshal(ret, &gpus)
|
|
if err1 != nil {
|
|
return err1
|
|
}
|
|
|
|
err2 := json.Unmarshal(ret, &sys)
|
|
if err2 != nil {
|
|
return err2
|
|
}
|
|
|
|
metrics := genTagsFields(gpus, sys)
|
|
for _, metric := range metrics {
|
|
acc.AddFields(measurement, metric.fields, metric.tags)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// setTagIfUsed stores v under key k in m unless v is empty, in which case m
// is left untouched (an existing entry for k is not removed).
func setTagIfUsed(m map[string]string, k, v string) {
	if v == "" {
		return
	}
	m[k] = v
}
|
|
|
|
// setIfUsed converts the raw rocm-smi value v according to the type selector
// t and, on success, stores the result in m under key k.
//
// Supported selectors:
//   - "float": the first whitespace-separated token parsed as float64
//   - "int":   the first whitespace-separated token parsed as int
//   - "int64": the first whitespace-separated token parsed as int64
//   - "str":   the complete value with surrounding whitespace removed
//
// Numeric selectors look at the first token only, so any text following the
// number is ignored. Nothing is stored (and an existing entry for k is kept)
// when v is empty or consists solely of whitespace, when the numeric
// conversion fails, or when t is not one of the selectors above.
func setIfUsed(t string, m map[string]interface{}, k, v string) {
	tokens := strings.Fields(v)
	if len(tokens) == 0 {
		return
	}

	// strings.Fields never yields empty tokens, so first is non-empty here.
	first := tokens[0]

	switch t {
	case "float":
		if f, err := strconv.ParseFloat(first, 64); err == nil {
			m[k] = f
		}
	case "int":
		if i, err := strconv.Atoi(first); err == nil {
			m[k] = i
		}
	case "int64":
		if i, err := strconv.ParseInt(first, 10, 64); err == nil {
			m[k] = i
		}
	case "str":
		// Textual values such as product names regularly contain spaces;
		// keep the whole value instead of cutting it off after the first
		// word.
		m[k] = strings.TrimSpace(v)
	}
}
|
|
|
|
func init() {
|
|
inputs.Add("amd_rocm_smi", func() telegraf.Input {
|
|
return &ROCmSMI{
|
|
BinPath: "/opt/rocm/bin/rocm-smi",
|
|
Timeout: config.Duration(5 * time.Second),
|
|
}
|
|
})
|
|
}
|