136 lines
3.3 KiB
Go
136 lines
3.3 KiB
Go
|
//go:generate ../../../tools/readme_config_includer/generator
|
||
|
package nvidia_smi
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
_ "embed"
|
||
|
"encoding/xml"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"io"
|
||
|
"os"
|
||
|
"os/exec"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
"github.com/influxdata/telegraf"
|
||
|
"github.com/influxdata/telegraf/config"
|
||
|
"github.com/influxdata/telegraf/internal"
|
||
|
"github.com/influxdata/telegraf/plugins/inputs"
|
||
|
"github.com/influxdata/telegraf/plugins/inputs/nvidia_smi/schema_v11"
|
||
|
"github.com/influxdata/telegraf/plugins/inputs/nvidia_smi/schema_v12"
|
||
|
)
|
||
|
|
||
|
//go:embed sample.conf
|
||
|
var sampleConfig string
|
||
|
|
||
|
type NvidiaSMI struct {
|
||
|
BinPath string `toml:"bin_path"`
|
||
|
Timeout config.Duration `toml:"timeout"`
|
||
|
Log telegraf.Logger `toml:"-"`
|
||
|
|
||
|
nvidiaSMIArgs []string
|
||
|
ignorePlugin bool
|
||
|
once sync.Once
|
||
|
}
|
||
|
|
||
|
func (*NvidiaSMI) SampleConfig() string {
|
||
|
return sampleConfig
|
||
|
}
|
||
|
|
||
|
func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
|
||
|
if _, err := os.Stat(smi.BinPath); os.IsNotExist(err) {
|
||
|
binPath, err := exec.LookPath("nvidia-smi")
|
||
|
if err != nil {
|
||
|
return &internal.StartupError{Err: err}
|
||
|
}
|
||
|
smi.BinPath = binPath
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Stop is a no-op; the plugin holds nothing that needs to be released.
func (*NvidiaSMI) Stop() {
}
|
||
|
|
||
|
func (smi *NvidiaSMI) Probe() error {
|
||
|
// Construct and execute metrics query
|
||
|
_, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
|
||
|
if smi.ignorePlugin {
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Construct and execute metrics query
|
||
|
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
|
||
|
if err != nil {
|
||
|
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
|
||
|
}
|
||
|
|
||
|
// Parse the output
|
||
|
return smi.parse(acc, data)
|
||
|
}
|
||
|
|
||
|
func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
|
||
|
schema := "v11"
|
||
|
|
||
|
buf := bytes.NewBuffer(data)
|
||
|
decoder := xml.NewDecoder(buf)
|
||
|
for {
|
||
|
token, err := decoder.Token()
|
||
|
if err != nil {
|
||
|
if errors.Is(err, io.EOF) {
|
||
|
break
|
||
|
}
|
||
|
return fmt.Errorf("reading token failed: %w", err)
|
||
|
}
|
||
|
d, ok := token.(xml.Directive)
|
||
|
if !ok {
|
||
|
continue
|
||
|
}
|
||
|
directive := string(d)
|
||
|
if !strings.HasPrefix(directive, "DOCTYPE") {
|
||
|
continue
|
||
|
}
|
||
|
parts := strings.Split(directive, " ")
|
||
|
s := strings.Trim(parts[len(parts)-1], "\" ")
|
||
|
if strings.HasPrefix(s, "nvsmi_device_") && strings.HasSuffix(s, ".dtd") {
|
||
|
schema = strings.TrimSuffix(strings.TrimPrefix(s, "nvsmi_device_"), ".dtd")
|
||
|
} else {
|
||
|
smi.Log.Debugf("Cannot find schema version in %q", directive)
|
||
|
}
|
||
|
break
|
||
|
}
|
||
|
smi.Log.Debugf("Using schema version in %s", schema)
|
||
|
|
||
|
switch schema {
|
||
|
case "v10", "v11":
|
||
|
return schema_v11.Parse(acc, data)
|
||
|
case "v12":
|
||
|
return schema_v12.Parse(acc, data)
|
||
|
}
|
||
|
|
||
|
smi.once.Do(func() {
|
||
|
smi.Log.Warnf(`Unknown schema version %q, using latest know schema for parsing.
|
||
|
Please report this as an issue to https://github.com/influxdata/telegraf together
|
||
|
with a sample output of 'nvidia_smi -q -x'!`, schema)
|
||
|
})
|
||
|
return schema_v12.Parse(acc, data)
|
||
|
}
|
||
|
|
||
|
func init() {
|
||
|
inputs.Add("nvidia_smi", func() telegraf.Input {
|
||
|
return &NvidiaSMI{
|
||
|
BinPath: "/usr/bin/nvidia-smi",
|
||
|
Timeout: config.Duration(5 * time.Second),
|
||
|
nvidiaSMIArgs: []string{"-q", "-x"},
|
||
|
}
|
||
|
})
|
||
|
}
|