Adding upstream version 1.34.4.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
e393c3af3f
commit
4978089aab
4963 changed files with 677545 additions and 0 deletions
476
plugins/inputs/slurm/slurm.go
Normal file
476
plugins/inputs/slurm/slurm.go
Normal file
|
@ -0,0 +1,476 @@
|
|||
//go:generate ../../../tools/config_includer/generator
|
||||
//go:generate ../../../tools/readme_config_includer/generator
|
||||
package slurm
|
||||
|
||||
import (
|
||||
"context"
|
||||
_ "embed"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
goslurm "github.com/pcolladosoto/goslurm/v0038"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/config"
|
||||
"github.com/influxdata/telegraf/internal"
|
||||
"github.com/influxdata/telegraf/plugins/common/tls"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
//go:embed sample.conf
|
||||
var sampleConfig string
|
||||
|
||||
type Slurm struct {
|
||||
URL string `toml:"url"`
|
||||
Username string `toml:"username"`
|
||||
Token string `toml:"token"`
|
||||
EnabledEndpoints []string `toml:"enabled_endpoints"`
|
||||
ResponseTimeout config.Duration `toml:"response_timeout"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
tls.ClientConfig
|
||||
|
||||
client *goslurm.APIClient
|
||||
baseURL *url.URL
|
||||
endpointMap map[string]bool
|
||||
}
|
||||
|
||||
func (*Slurm) SampleConfig() string {
|
||||
return sampleConfig
|
||||
}
|
||||
|
||||
func (s *Slurm) Init() error {
|
||||
if len(s.EnabledEndpoints) == 0 {
|
||||
s.EnabledEndpoints = []string{"diag", "jobs", "nodes", "partitions", "reservations"}
|
||||
}
|
||||
|
||||
s.endpointMap = make(map[string]bool, len(s.EnabledEndpoints))
|
||||
for _, endpoint := range s.EnabledEndpoints {
|
||||
switch e := strings.ToLower(endpoint); e {
|
||||
case "diag", "jobs", "nodes", "partitions", "reservations":
|
||||
s.endpointMap[e] = true
|
||||
default:
|
||||
return fmt.Errorf("unknown endpoint %q", endpoint)
|
||||
}
|
||||
}
|
||||
|
||||
if s.URL == "" {
|
||||
return errors.New("empty URL provided")
|
||||
}
|
||||
|
||||
u, err := url.Parse(s.URL)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if u.Hostname() == "" {
|
||||
return fmt.Errorf("empty hostname for url %q", s.URL)
|
||||
}
|
||||
|
||||
s.baseURL = u
|
||||
|
||||
if u.Scheme != "http" && u.Scheme != "https" {
|
||||
return fmt.Errorf("invalid scheme %q", u.Scheme)
|
||||
}
|
||||
|
||||
tlsCfg, err := s.ClientConfig.TLSConfig()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if u.Scheme == "http" && tlsCfg != nil {
|
||||
s.Log.Warn("non-empty TLS configuration for a URL with an http scheme. Ignoring it...")
|
||||
tlsCfg = nil
|
||||
}
|
||||
|
||||
configuration := goslurm.NewConfiguration()
|
||||
configuration.Host = u.Host
|
||||
configuration.Scheme = u.Scheme
|
||||
configuration.UserAgent = internal.ProductToken()
|
||||
configuration.HTTPClient = &http.Client{
|
||||
Transport: &http.Transport{
|
||||
TLSClientConfig: tlsCfg,
|
||||
},
|
||||
Timeout: time.Duration(s.ResponseTimeout),
|
||||
}
|
||||
|
||||
s.client = goslurm.NewAPIClient(configuration)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Slurm) Gather(acc telegraf.Accumulator) (err error) {
|
||||
auth := context.WithValue(
|
||||
context.Background(),
|
||||
goslurm.ContextAPIKeys,
|
||||
map[string]goslurm.APIKey{
|
||||
"user": {Key: s.Username},
|
||||
"token": {Key: s.Token},
|
||||
},
|
||||
)
|
||||
|
||||
if s.endpointMap["diag"] {
|
||||
diagResp, respRaw, err := s.client.SlurmAPI.SlurmV0038Diag(auth).Execute()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting diag: %w", err)
|
||||
}
|
||||
if diag, ok := diagResp.GetStatisticsOk(); ok {
|
||||
s.gatherDiagMetrics(acc, diag)
|
||||
}
|
||||
respRaw.Body.Close()
|
||||
}
|
||||
|
||||
if s.endpointMap["jobs"] {
|
||||
jobsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetJobs(auth).Execute()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting jobs: %w", err)
|
||||
}
|
||||
if jobs, ok := jobsResp.GetJobsOk(); ok {
|
||||
s.gatherJobsMetrics(acc, jobs)
|
||||
}
|
||||
respRaw.Body.Close()
|
||||
}
|
||||
|
||||
if s.endpointMap["nodes"] {
|
||||
nodesResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetNodes(auth).Execute()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting nodes: %w", err)
|
||||
}
|
||||
if nodes, ok := nodesResp.GetNodesOk(); ok {
|
||||
s.gatherNodesMetrics(acc, nodes)
|
||||
}
|
||||
respRaw.Body.Close()
|
||||
}
|
||||
|
||||
if s.endpointMap["partitions"] {
|
||||
partitionsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetPartitions(auth).Execute()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting partitions: %w", err)
|
||||
}
|
||||
if partitions, ok := partitionsResp.GetPartitionsOk(); ok {
|
||||
s.gatherPartitionsMetrics(acc, partitions)
|
||||
}
|
||||
respRaw.Body.Close()
|
||||
}
|
||||
|
||||
if s.endpointMap["reservations"] {
|
||||
reservationsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetReservations(auth).Execute()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error getting reservations: %w", err)
|
||||
}
|
||||
if reservations, ok := reservationsResp.GetReservationsOk(); ok {
|
||||
s.gatherReservationsMetrics(acc, reservations)
|
||||
}
|
||||
respRaw.Body.Close()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func parseTres(tres string) map[string]interface{} {
|
||||
tresKVs := strings.Split(tres, ",")
|
||||
parsedValues := make(map[string]interface{}, len(tresKVs))
|
||||
|
||||
for _, tresVal := range tresKVs {
|
||||
parsedTresVal := strings.Split(tresVal, "=")
|
||||
if len(parsedTresVal) != 2 {
|
||||
continue
|
||||
}
|
||||
|
||||
tag := parsedTresVal[0]
|
||||
val := parsedTresVal[1]
|
||||
var factor float64 = 1
|
||||
|
||||
if tag == "mem" {
|
||||
var ok bool
|
||||
factor, ok = map[string]float64{
|
||||
"K": 1.0 / 1024.0,
|
||||
"M": 1,
|
||||
"G": 1024,
|
||||
"T": 1024 * 1024,
|
||||
"P": 1024 * 1024 * 1024,
|
||||
}[strings.ToUpper(val[len(val)-1:])]
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
val = val[:len(val)-1]
|
||||
}
|
||||
|
||||
parsedFloat, err := strconv.ParseFloat(val, 64)
|
||||
if err == nil {
|
||||
parsedValues[tag] = parsedFloat * factor
|
||||
continue
|
||||
}
|
||||
parsedValues[tag] = val
|
||||
}
|
||||
|
||||
return parsedValues
|
||||
}
|
||||
|
||||
func (s *Slurm) gatherDiagMetrics(acc telegraf.Accumulator, diag *goslurm.V0038DiagStatistics) {
|
||||
records := make(map[string]interface{}, 13)
|
||||
tags := map[string]string{"source": s.baseURL.Hostname()}
|
||||
|
||||
if int32Ptr, ok := diag.GetServerThreadCountOk(); ok {
|
||||
records["server_thread_count"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsCanceledOk(); ok {
|
||||
records["jobs_canceled"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsSubmittedOk(); ok {
|
||||
records["jobs_submitted"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsStartedOk(); ok {
|
||||
records["jobs_started"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsCompletedOk(); ok {
|
||||
records["jobs_completed"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsFailedOk(); ok {
|
||||
records["jobs_failed"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsPendingOk(); ok {
|
||||
records["jobs_pending"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetJobsRunningOk(); ok {
|
||||
records["jobs_running"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetScheduleCycleLastOk(); ok {
|
||||
records["schedule_cycle_last"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetScheduleCycleMeanOk(); ok {
|
||||
records["schedule_cycle_mean"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetBfQueueLenOk(); ok {
|
||||
records["bf_queue_len"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := diag.GetBfQueueLenMeanOk(); ok {
|
||||
records["bf_queue_len_mean"] = *int32Ptr
|
||||
}
|
||||
if boolPtr, ok := diag.GetBfActiveOk(); ok {
|
||||
records["bf_active"] = *boolPtr
|
||||
}
|
||||
|
||||
acc.AddFields("slurm_diag", records, tags)
|
||||
}
|
||||
|
||||
func (s *Slurm) gatherJobsMetrics(acc telegraf.Accumulator, jobs []goslurm.V0038JobResponseProperties) {
|
||||
for i := range jobs {
|
||||
records := make(map[string]interface{}, 19)
|
||||
tags := make(map[string]string, 3)
|
||||
|
||||
tags["source"] = s.baseURL.Hostname()
|
||||
if strPtr, ok := jobs[i].GetNameOk(); ok {
|
||||
tags["name"] = *strPtr
|
||||
}
|
||||
if int32Ptr, ok := jobs[i].GetJobIdOk(); ok {
|
||||
tags["job_id"] = strconv.Itoa(int(*int32Ptr))
|
||||
}
|
||||
|
||||
if strPtr, ok := jobs[i].GetJobStateOk(); ok {
|
||||
records["state"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetStateReasonOk(); ok {
|
||||
records["state_reason"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetPartitionOk(); ok {
|
||||
records["partition"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetNodesOk(); ok {
|
||||
records["nodes"] = *strPtr
|
||||
}
|
||||
if int32Ptr, ok := jobs[i].GetNodeCountOk(); ok {
|
||||
records["node_count"] = *int32Ptr
|
||||
}
|
||||
if int64Ptr, ok := jobs[i].GetPriorityOk(); ok {
|
||||
records["priority"] = *int64Ptr
|
||||
}
|
||||
if int32Ptr, ok := jobs[i].GetNiceOk(); ok {
|
||||
records["nice"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := jobs[i].GetGroupIdOk(); ok {
|
||||
records["group_id"] = *int32Ptr
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetCommandOk(); ok {
|
||||
records["command"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetStandardOutputOk(); ok {
|
||||
records["standard_output"] = strings.ReplaceAll(*strPtr, "\\", "")
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetStandardErrorOk(); ok {
|
||||
records["standard_error"] = strings.ReplaceAll(*strPtr, "\\", "")
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetStandardInputOk(); ok {
|
||||
records["standard_input"] = strings.ReplaceAll(*strPtr, "\\", "")
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetCurrentWorkingDirectoryOk(); ok {
|
||||
records["current_working_directory"] = strings.ReplaceAll(*strPtr, "\\", "")
|
||||
}
|
||||
if int64Ptr, ok := jobs[i].GetSubmitTimeOk(); ok {
|
||||
records["submit_time"] = *int64Ptr
|
||||
}
|
||||
if int64Ptr, ok := jobs[i].GetStartTimeOk(); ok {
|
||||
records["start_time"] = *int64Ptr
|
||||
}
|
||||
if int32Ptr, ok := jobs[i].GetCpusOk(); ok {
|
||||
records["cpus"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := jobs[i].GetTasksOk(); ok {
|
||||
records["tasks"] = *int32Ptr
|
||||
}
|
||||
if int64Ptr, ok := jobs[i].GetTimeLimitOk(); ok {
|
||||
records["time_limit"] = *int64Ptr
|
||||
}
|
||||
if strPtr, ok := jobs[i].GetTresReqStrOk(); ok {
|
||||
for k, v := range parseTres(*strPtr) {
|
||||
records["tres_"+k] = v
|
||||
}
|
||||
}
|
||||
|
||||
acc.AddFields("slurm_jobs", records, tags)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Slurm) gatherNodesMetrics(acc telegraf.Accumulator, nodes []goslurm.V0038Node) {
|
||||
for _, node := range nodes {
|
||||
records := make(map[string]interface{}, 13)
|
||||
tags := make(map[string]string, 2)
|
||||
|
||||
tags["source"] = s.baseURL.Hostname()
|
||||
if strPtr, ok := node.GetNameOk(); ok {
|
||||
tags["name"] = *strPtr
|
||||
}
|
||||
|
||||
if strPtr, ok := node.GetStateOk(); ok {
|
||||
records["state"] = *strPtr
|
||||
}
|
||||
if int32Ptr, ok := node.GetCoresOk(); ok {
|
||||
records["cores"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := node.GetCpusOk(); ok {
|
||||
records["cpus"] = *int32Ptr
|
||||
}
|
||||
if int64Ptr, ok := node.GetCpuLoadOk(); ok {
|
||||
records["cpu_load"] = *int64Ptr
|
||||
}
|
||||
if int64Ptr, ok := node.GetAllocCpusOk(); ok {
|
||||
records["alloc_cpu"] = *int64Ptr
|
||||
}
|
||||
if int32Ptr, ok := node.GetRealMemoryOk(); ok {
|
||||
records["real_memory"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := node.GetFreeMemoryOk(); ok {
|
||||
records["free_memory"] = *int32Ptr
|
||||
}
|
||||
if int64Ptr, ok := node.GetAllocMemoryOk(); ok {
|
||||
records["alloc_memory"] = *int64Ptr
|
||||
}
|
||||
if strPtr, ok := node.GetTresOk(); ok {
|
||||
for k, v := range parseTres(*strPtr) {
|
||||
records["tres_"+k] = v
|
||||
}
|
||||
}
|
||||
if strPtr, ok := node.GetTresUsedOk(); ok {
|
||||
for k, v := range parseTres(*strPtr) {
|
||||
records["tres_used_"+k] = v
|
||||
}
|
||||
}
|
||||
if int32Ptr, ok := node.GetWeightOk(); ok {
|
||||
records["weight"] = *int32Ptr
|
||||
}
|
||||
if strPtr, ok := node.GetSlurmdVersionOk(); ok {
|
||||
records["slurmd_version"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := node.GetArchitectureOk(); ok {
|
||||
records["architecture"] = *strPtr
|
||||
}
|
||||
|
||||
acc.AddFields("slurm_nodes", records, tags)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Slurm) gatherPartitionsMetrics(acc telegraf.Accumulator, partitions []goslurm.V0038Partition) {
|
||||
for _, partition := range partitions {
|
||||
records := make(map[string]interface{}, 5)
|
||||
tags := make(map[string]string, 2)
|
||||
|
||||
tags["source"] = s.baseURL.Hostname()
|
||||
if strPtr, ok := partition.GetNameOk(); ok {
|
||||
tags["name"] = *strPtr
|
||||
}
|
||||
|
||||
if strPtr, ok := partition.GetStateOk(); ok {
|
||||
records["state"] = *strPtr
|
||||
}
|
||||
if int32Ptr, ok := partition.GetTotalCpusOk(); ok {
|
||||
records["total_cpu"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := partition.GetTotalNodesOk(); ok {
|
||||
records["total_nodes"] = *int32Ptr
|
||||
}
|
||||
if strPtr, ok := partition.GetNodesOk(); ok {
|
||||
records["nodes"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := partition.GetTresOk(); ok {
|
||||
for k, v := range parseTres(*strPtr) {
|
||||
records["tres_"+k] = v
|
||||
}
|
||||
}
|
||||
|
||||
acc.AddFields("slurm_partitions", records, tags)
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Slurm) gatherReservationsMetrics(acc telegraf.Accumulator, reservations []goslurm.V0038Reservation) {
|
||||
for _, reservation := range reservations {
|
||||
records := make(map[string]interface{}, 9)
|
||||
tags := make(map[string]string, 2)
|
||||
|
||||
tags["source"] = s.baseURL.Hostname()
|
||||
if strPtr, ok := reservation.GetNameOk(); ok {
|
||||
tags["name"] = *strPtr
|
||||
}
|
||||
|
||||
if int32Ptr, ok := reservation.GetCoreCountOk(); ok {
|
||||
records["core_count"] = *int32Ptr
|
||||
}
|
||||
if int32Ptr, ok := reservation.GetCoreSpecCntOk(); ok {
|
||||
records["core_spec_count"] = *int32Ptr
|
||||
}
|
||||
if strPtr, ok := reservation.GetGroupsOk(); ok {
|
||||
records["groups"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := reservation.GetUsersOk(); ok {
|
||||
records["users"] = *strPtr
|
||||
}
|
||||
if int32Ptr, ok := reservation.GetStartTimeOk(); ok {
|
||||
records["start_time"] = *int32Ptr
|
||||
}
|
||||
if strPtr, ok := reservation.GetPartitionOk(); ok {
|
||||
records["partition"] = *strPtr
|
||||
}
|
||||
if strPtr, ok := reservation.GetAccountsOk(); ok {
|
||||
records["accounts"] = *strPtr
|
||||
}
|
||||
if int32Ptr, ok := reservation.GetNodeCountOk(); ok {
|
||||
records["node_count"] = *int32Ptr
|
||||
}
|
||||
if strPtr, ok := reservation.GetNodeListOk(); ok {
|
||||
records["node_list"] = *strPtr
|
||||
}
|
||||
|
||||
acc.AddFields("slurm_reservations", records, tags)
|
||||
}
|
||||
}
|
||||
|
||||
func init() {
|
||||
inputs.Add("slurm", func() telegraf.Input {
|
||||
return &Slurm{
|
||||
ResponseTimeout: config.Duration(5 * time.Second),
|
||||
}
|
||||
})
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue