1
0
Fork 0
telegraf/plugins/inputs/amd_rocm_smi/amd_rocm_smi_test.go

611 lines
20 KiB
Go
Raw Normal View History

package amd_rocm_smi
import (
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/models"
"github.com/influxdata/telegraf/testutil"
)
func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find rocm-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "amd_rocm_smi",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
require.NotErrorAs(t, model.Start(&acc), &ferr)
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
}
func TestErrorBehaviorError(t *testing.T) {
// make sure we can't find rocm-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "amd_rocm_smi",
StartupErrorBehavior: "error",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
require.NotErrorAs(t, model.Start(&acc), &ferr)
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
}
func TestErrorBehaviorRetry(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "amd_rocm_smi",
StartupErrorBehavior: "retry",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
require.NotErrorAs(t, model.Start(&acc), &ferr)
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
}
func TestErrorBehaviorIgnore(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
plugin := &ROCmSMI{
BinPath: "/random/non-existent/path",
Log: &testutil.Logger{},
}
model := models.NewRunningInput(plugin, &models.InputConfig{
Name: "amd_rocm_smi",
StartupErrorBehavior: "ignore",
})
require.NoError(t, model.Init())
var acc testutil.Accumulator
var ferr *internal.FatalError
require.ErrorAs(t, model.Start(&acc), &ferr)
require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
}
// TestGatherValidJSON is a table-driven test: for every case it reads a JSON
// fixture from the testdata directory, passes the raw bytes to gatherROCmSMI
// and requires that the metrics collected in the accumulator equal the
// expected metrics listed for that case.
func TestGatherValidJSON(t *testing.T) {
tests := []struct {
// name is used as the subtest name.
name string
// filename is the fixture file, resolved relative to "testdata".
filename string
// expected lists the metrics the fixture must produce, one per card.
expected []telegraf.Metric
}{
{
// Single card; this case expects no card_series, utilization_memory or
// clocks_current_fabric field.
name: "Vega 10 XT",
filename: "vega-10-XT.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x6861",
"gpu_unique_id": "0x2150e7d042a1124",
"name": "card0",
},
map[string]interface{}{
"card_model": "0xc1e",
"card_vendor": "Advanced",
"driver_version": 5925,
"fan_speed": 13,
"memory_total": int64(17163091968),
"memory_used": int64(17776640),
"memory_free": int64(17145315328),
"temperature_sensor_edge": 39.0,
"temperature_sensor_junction": 40.0,
"temperature_sensor_memory": 92.0,
"utilization_gpu": 0,
"clocks_current_display": 600,
"clocks_current_sm": 1269,
"clocks_current_memory": 167,
"clocks_current_system": 960,
"power_draw": 15.0,
},
time.Unix(0, 0)),
},
},
{
// Single card; this case expects the full field set, including fan_speed,
// clocks_current_display and clocks_current_fabric.
name: "Vega 20 WKS GL-XE [Radeon Pro VII]",
filename: "vega-20-WKS-GL-XE.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x66a1",
"gpu_unique_id": "0x2f048617326b1ea",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x834",
"card_series": "Radeon",
"card_vendor": "Advanced",
"driver_version": 5917,
"fan_speed": 0,
"memory_total": int64(34342961152),
"memory_used": int64(10850304),
"memory_free": int64(34332110848),
"temperature_sensor_edge": 36.0,
"temperature_sensor_junction": 38.0,
"temperature_sensor_memory": 35.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_display": 357,
"clocks_current_fabric": 1080,
"clocks_current_sm": 1725,
"clocks_current_memory": 1000,
"clocks_current_system": 971,
"power_draw": 26.0,
},
time.Unix(0, 0),
),
},
},
{
// Six cards (card0 through card5), all with gpu_unique_id "N/A"; no
// fan_speed or clocks_current_display field is expected.
name: "mi100 + ROCm 571",
filename: "mi100_rocm571.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "N/A",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 624,
"memory_total": int64(34342961152),
"memory_used": int64(6782976),
"memory_free": int64(34336178176),
"temperature_sensor_edge": 31.0,
"temperature_sensor_junction": 34.0,
"temperature_sensor_memory": 30.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 39.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "N/A",
"name": "card1",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 624,
"memory_total": int64(34342961152),
"memory_used": int64(6782976),
"memory_free": int64(34336178176),
"temperature_sensor_edge": 30.0,
"temperature_sensor_junction": 33.0,
"temperature_sensor_memory": 38.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 37.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "N/A",
"name": "card2",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 624,
"memory_total": int64(34342961152),
"memory_used": int64(6782976),
"memory_free": int64(34336178176),
"temperature_sensor_edge": 31.0,
"temperature_sensor_junction": 34.0,
"temperature_sensor_memory": 31.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 35.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "N/A",
"name": "card3",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 624,
"memory_total": int64(34342961152),
"memory_used": int64(6782976),
"memory_free": int64(34336178176),
"temperature_sensor_edge": 33.0,
"temperature_sensor_junction": 35.0,
"temperature_sensor_memory": 36.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 39.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "N/A",
"name": "card4",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 624,
"memory_total": int64(34342961152),
"memory_used": int64(6782976),
"memory_free": int64(34336178176),
"temperature_sensor_edge": 32.0,
"temperature_sensor_junction": 34.0,
"temperature_sensor_memory": 38.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 39.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "N/A",
"name": "card5",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 624,
"memory_total": int64(34342961152),
"memory_used": int64(6782976),
"memory_free": int64(34336178176),
"temperature_sensor_edge": 33.0,
"temperature_sensor_junction": 35.0,
"temperature_sensor_memory": 38.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 40.0,
},
time.Unix(0, 0),
),
},
},
{
// Four cards (card0 through card3), each with its own gpu_unique_id.
name: "mi100 + ROCm 602",
filename: "mi100_rocm602.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "0x79ccd55167a2124a",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(34342961152),
"memory_used": int64(6750208),
"memory_free": int64(34336210944),
"temperature_sensor_edge": 53.0,
"temperature_sensor_junction": 55.0,
"temperature_sensor_memory": 53.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 36.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "0x4edfb117a17a07d",
"name": "card1",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(34342961152),
"memory_used": int64(6750208),
"memory_free": int64(34336210944),
"temperature_sensor_edge": 55.0,
"temperature_sensor_junction": 58.0,
"temperature_sensor_memory": 54.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 44.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "0xd4a9ec48d03d261d",
"name": "card2",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(34342961152),
"memory_used": int64(6750208),
"memory_free": int64(34336210944),
"temperature_sensor_edge": 54.0,
"temperature_sensor_junction": 57.0,
"temperature_sensor_memory": 55.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 43.0,
},
time.Unix(0, 0),
),
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x738c",
"gpu_unique_id": "0x1b9dd972253c3736",
"name": "card3",
},
map[string]interface{}{
"card_model": "0x0c34",
"card_series": "Arcturus",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(34342961152),
"memory_used": int64(6750208),
"memory_free": int64(34336210944),
"temperature_sensor_edge": 51.0,
"temperature_sensor_junction": 53.0,
"temperature_sensor_memory": 50.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_fabric": 1402,
"clocks_current_sm": 300,
"clocks_current_memory": 1200,
"clocks_current_system": 1000,
"power_draw": 39.0,
},
time.Unix(0, 0),
),
},
},
{
// Single card; card_model, card_series and card_vendor are all expected
// to carry the same raw value "0x1002" in this case.
name: "rx6700xt + ROCm 430",
filename: "rx6700xt_rocm430.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x73df",
"gpu_unique_id": "N/A",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x1002",
"card_series": "0x1002",
"card_vendor": "0x1002",
"driver_version": 636,
"memory_total": int64(12868124672),
"memory_used": int64(1622728704),
"memory_free": int64(11245395968),
"temperature_sensor_edge": 45.0,
"temperature_sensor_junction": 47.0,
"temperature_sensor_memory": 46.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_display": 480,
"clocks_current_fabric": 1051,
"clocks_current_sm": 500,
"clocks_current_memory": 96,
"clocks_current_system": 685,
"power_draw": 6.0,
},
time.Unix(0, 0),
),
},
},
{
name: "rx6700xt + ROCm 571",
filename: "rx6700xt_rocm571.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x73df",
"gpu_unique_id": "N/A",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x6601",
"card_series": "Navi",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(12868124672),
"memory_used": int64(1564491776),
"memory_free": int64(11303632896),
"temperature_sensor_edge": 45.0,
"temperature_sensor_junction": 47.0,
"temperature_sensor_memory": 46.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_display": 480,
"clocks_current_fabric": 1051,
"clocks_current_sm": 500,
"clocks_current_memory": 96,
"clocks_current_system": 685,
"power_draw": 6.0,
},
time.Unix(0, 0),
),
},
},
{
// Unlike the other rx6700xt cases, no power_draw field is expected here.
name: "rx6700xt + ROCm 602",
filename: "rx6700xt_rocm602.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x73df",
"gpu_unique_id": "N/A",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x6601",
"card_series": "Navi",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(12868124672),
"memory_used": int64(1572757504),
"memory_free": int64(11295367168),
"temperature_sensor_edge": 45.0,
"temperature_sensor_junction": 47.0,
"temperature_sensor_memory": 46.0,
"utilization_gpu": 0,
"utilization_memory": 0,
"clocks_current_display": 480,
"clocks_current_fabric": 1051,
"clocks_current_sm": 500,
"clocks_current_memory": 96,
"clocks_current_system": 685,
},
time.Unix(0, 0),
),
},
},
{
// The only case with a non-zero utilization_memory (12) and a zero
// clocks_current_sm.
name: "rx6700xt + ROCm 612",
filename: "rx6700xt_rocm612.json",
expected: []telegraf.Metric{
testutil.MustMetric(
"amd_rocm_smi",
map[string]string{
"gpu_id": "0x73df",
"gpu_unique_id": "N/A",
"name": "card0",
},
map[string]interface{}{
"card_model": "0x73df",
"card_series": "Navi",
"card_vendor": "Advanced",
"driver_version": 636,
"memory_total": int64(12868124672),
"memory_used": int64(1572745216),
"memory_free": int64(11295379456),
"temperature_sensor_edge": 45.0,
"temperature_sensor_junction": 47.0,
"temperature_sensor_memory": 46.0,
"utilization_gpu": 0,
"utilization_memory": 12,
"clocks_current_display": 480,
"clocks_current_fabric": 1051,
"clocks_current_sm": 0,
"clocks_current_memory": 96,
"clocks_current_system": 685,
"power_draw": 6.0,
},
time.Unix(0, 0),
),
},
},
}
// Run every table entry as its own subtest, named after the case.
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var acc testutil.Accumulator
// Load the raw fixture bytes; a missing or unreadable file fails the subtest.
octets, err := os.ReadFile(filepath.Join("testdata", tt.filename))
require.NoError(t, err)
// gatherROCmSMI is defined elsewhere in this package; here it is handed the
// fixture bytes and the accumulator that collects the resulting metrics.
err = gatherROCmSMI(octets, &acc)
require.NoError(t, err)
// Compare with the IgnoreTime and SortMetrics options; presumably this makes
// the time.Unix(0, 0) placeholders and the metric order irrelevant.
testutil.RequireMetricsEqual(t, tt.expected, acc.GetTelegrafMetrics(), testutil.IgnoreTime(), testutil.SortMetrics())
})
}
}