1
0
Fork 0

Adding upstream version 1.34.4.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-05-24 07:26:29 +02:00
parent e393c3af3f
commit 4978089aab
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
4963 changed files with 677545 additions and 0 deletions

View file

@ -0,0 +1,197 @@
# SLURM Input Plugin
This plugin gathers diag, jobs, nodes, partitions and reservation metrics by
leveraging SLURM's REST API as provided by the `slurmrestd` daemon.
This plugin targets the `openapi/v0.0.38` OpenAPI plugin as defined in SLURM's
documentation. That particular plugin should be configured when starting the
`slurmrestd` daemon up. For more information, be sure to check SLURM's
documentation [here][SLURM Doc].
A great wealth of information can also be found on the repository of the
Go module implementing the API client, [pcolladosoto/goslurm][].
[SLURM Doc]: https://slurm.schedmd.com/rest.html
[pcolladosoto/goslurm]: https://github.com/pcolladosoto/goslurm
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and fields or create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
## Configuration
```toml @sample.conf
# Gather SLURM metrics
[[inputs.slurm]]
## Slurmrestd URL. Both http and https can be used as schemas.
url = "http://127.0.0.1:6820"
## Credentials for JWT-based authentication.
# username = "foo"
# token = "topSecret"
## Enabled endpoints
## List of endpoints a user can acquire data from.
## Available values are: diag, jobs, nodes, partitions, reservations.
# enabled_endpoints = ["diag", "jobs", "nodes", "partitions", "reservations"]
## Maximum time to receive a response. If set to 0s, the
## request will not time out.
# response_timeout = "5s"
## Optional TLS Config. Note these options will only
## be taken into account when the scheme specified on
## the URL parameter is https. They will be silently
## ignored otherwise.
## Set to true/false to enforce TLS being enabled/disabled. If not set,
## enable TLS only if any of the other options are specified.
# tls_enable =
## Trusted root certificates for server
# tls_ca = "/path/to/cafile"
## Used for TLS client certificate authentication
# tls_cert = "/path/to/certfile"
## Used for TLS client certificate authentication
# tls_key = "/path/to/keyfile"
## Password for the key file if it is encrypted
# tls_key_pwd = ""
## Send the specified TLS server name via SNI
# tls_server_name = "kubernetes.example.com"
## Minimal TLS version to accept by the client
# tls_min_version = "TLS12"
## List of ciphers to accept, by default all secure ciphers will be accepted
## See https://pkg.go.dev/crypto/tls#pkg-constants for supported values.
## Use "all", "secure" and "insecure" to add all supported ciphers, secure
## suites or insecure suites respectively.
# tls_cipher_suites = ["secure"]
## Renegotiation method, "never", "once" or "freely"
# tls_renegotiation_method = "never"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false
```
## Metrics
Given the great deal of metrics offered by SLURM's API, an attempt has been
done to strike a balance between verbosity and usefulness in terms of the
gathered information.
- slurm_diag
- tags:
- source
- fields:
- server_thread_count
- jobs_canceled
- jobs_submitted
- jobs_started
- jobs_completed
- jobs_failed
- jobs_pending
- jobs_running
- schedule_cycle_last
- schedule_cycle_mean
- bf_queue_len
- bf_queue_len_mean
- bf_active
- slurm_jobs
- tags:
- source
- name
- job_id
- fields:
- state
- state_reason
- partition
- nodes
- node_count
- priority
- nice
- group_id
- command
- standard_output
- standard_error
- standard_input
- current_working_directory
- submit_time
- start_time
- cpus
- tasks
- time_limit
- tres_cpu
- tres_mem
- tres_node
- tres_billing
- slurm_nodes
- tags:
- source
- name
- fields:
- state
- cores
- cpus
- cpu_load
- alloc_cpu
- real_memory
- free_memory
- alloc_memory
- tres_cpu
- tres_mem
- tres_billing
- tres_used_cpu
- tres_used_mem
- weight
- slurmd_version
- architecture
- slurm_partitions
- tags:
- source
- name
- fields:
- state
- total_cpu
- total_nodes
- nodes
- tres_cpu
- tres_mem
- tres_node
- tres_billing
- slurm_reservations
- tags:
- source
- name
- fields:
- core_count
- core_spec_count
- groups
- users
- start_time
- partition
- accounts
- node_count
- node_list
## Example Output
```text
slurm_diag,host=hoth,source=slurm_primary.example.net bf_active=false,bf_queue_len=1i,bf_queue_len_mean=1i,jobs_canceled=0i,jobs_completed=137i,jobs_failed=0i,jobs_pending=0i,jobs_running=100i,jobs_started=137i,jobs_submitted=137i,schedule_cycle_last=27i,schedule_cycle_mean=86i,server_thread_count=3i 1723466497000000000
slurm_jobs,host=hoth,job_id=23160,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.11BCgQ",cpus=2i,current_working_directory="/home/sessiondir/7CQODmQ3uw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmG9JKDmILUkln",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878569i,standard_error="/home/sessiondir/7CQODmQ3uw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmG9JKDmILUkln.comment",standard_input="/dev/null",standard_output="/home/sessiondir/7CQODmQ3uw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmG9JKDmILUkln.comment",start_time=1723354525i,state="RUNNING",state_reason="None",submit_time=1723354525i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=2000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23365,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.yRcFYL",cpus=2i,current_working_directory="/home/sessiondir/LgwNDmTLAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm2BKKDm8bFZsm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878364i,standard_error="/home/sessiondir/LgwNDmTLAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm2BKKDm8bFZsm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/LgwNDmTLAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm2BKKDm8bFZsm.comment",start_time=1723376763i,state="RUNNING",state_reason="None",submit_time=1723376761i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23366,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.5Y9Ngb",cpus=2i,current_working_directory="/home/sessiondir/HFYKDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm3BKKDmiyK3em",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878363i,standard_error="/home/sessiondir/HFYKDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm3BKKDmiyK3em.comment",standard_input="/dev/null",standard_output="/home/sessiondir/HFYKDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm3BKKDmiyK3em.comment",start_time=1723376883i,state="RUNNING",state_reason="None",submit_time=1723376882i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23367,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.NmOqMU",cpus=2i,current_working_directory="/home/sessiondir/nnLLDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm4BKKDmfhjFPn",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878362i,standard_error="/home/sessiondir/nnLLDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm4BKKDmfhjFPn.comment",standard_input="/dev/null",standard_output="/home/sessiondir/nnLLDmULAx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm4BKKDmfhjFPn.comment",start_time=1723376883i,state="RUNNING",state_reason="None",submit_time=1723376882i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23385,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.NNsI08",cpus=2i,current_working_directory="/home/sessiondir/PWvNDmH7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmz7JKDmqgKyRo",group_id=2005i,nice=50i,node_count=1i,nodes="naboo225",partition="atlas",priority=4294878344i,standard_error="/home/sessiondir/PWvNDmH7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmz7JKDmqgKyRo.comment",standard_input="/dev/null",standard_output="/home/sessiondir/PWvNDmH7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmz7JKDmqgKyRo.comment",start_time=1723378725i,state="RUNNING",state_reason="None",submit_time=1723378725i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23386,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.bcmS4h",cpus=2i,current_working_directory="/home/sessiondir/ZNHMDmI7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm27JKDm3Ve66n",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878343i,standard_error="/home/sessiondir/ZNHMDmI7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm27JKDm3Ve66n.comment",standard_input="/dev/null",standard_output="/home/sessiondir/ZNHMDmI7tw5nKG01gq4B3BRpm7wtQmABFKDmbnHPDm27JKDm3Ve66n.comment",start_time=1723379206i,state="RUNNING",state_reason="None",submit_time=1723379205i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23387,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.OgpoQZ",cpus=2i,current_working_directory="/home/sessiondir/qohNDmUqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmMCKKDmzM4Yhn",group_id=2005i,nice=50i,node_count=1i,nodes="naboo222",partition="atlas",priority=4294878342i,standard_error="/home/sessiondir/qohNDmUqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmMCKKDmzM4Yhn.comment",standard_input="/dev/null",standard_output="/home/sessiondir/qohNDmUqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmMCKKDmzM4Yhn.comment",start_time=1723379246i,state="RUNNING",state_reason="None",submit_time=1723379245i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23388,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.xYbxSe",cpus=2i,current_working_directory="/home/sessiondir/u9HODmXqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmWCKKDmRlccYn",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878341i,standard_error="/home/sessiondir/u9HODmXqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmWCKKDmRlccYn.comment",standard_input="/dev/null",standard_output="/home/sessiondir/u9HODmXqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmWCKKDmRlccYn.comment",start_time=1723379326i,state="RUNNING",state_reason="None",submit_time=1723379326i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23389,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.QHtIIm",cpus=2i,current_working_directory="/home/sessiondir/ZLvKDmYqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXCKKDmjp19km",group_id=2005i,nice=50i,node_count=1i,nodes="naboo227",partition="atlas",priority=4294878340i,standard_error="/home/sessiondir/ZLvKDmYqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXCKKDmjp19km.comment",standard_input="/dev/null",standard_output="/home/sessiondir/ZLvKDmYqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXCKKDmjp19km.comment",start_time=1723379326i,state="RUNNING",state_reason="None",submit_time=1723379326i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_jobs,host=hoth,job_id=23393,name=gridjob,source=slurm_primary.example.net command="/tmp/SLURM_job_script.IH19bN",cpus=2i,current_working_directory="/home/sessiondir/YdPODmVqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmSCKKDmrYDOwm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo224",partition="atlas",priority=4294878336i,standard_error="/home/sessiondir/YdPODmVqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmSCKKDmrYDOwm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/YdPODmVqBx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmSCKKDmrYDOwm.comment",start_time=1723379767i,state="RUNNING",state_reason="None",submit_time=1723379766i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=1000,tres_node=1 1723466497000000000
slurm_nodes,host=hoth,name=naboo145,source=slurm_primary.example.net alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=0i,cpus=36i,free_memory=86450i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo146,source=slurm_primary.example.net alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=0i,cpus=36i,free_memory=92148i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo147,source=slurm_primary.example.net alloc_cpu=36i,alloc_memory=45000i,architecture="x86_64",cores=18i,cpu_load=3826i,cpus=36i,free_memory=1607i,real_memory=94793i,slurmd_version="22.05.9",state="allocated",tres_billing=36,tres_cpu=36,tres_mem=94793,tres_used_cpu=36,tres_used_mem=45000,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo216,source=slurm_primary.example.net alloc_cpu=8i,alloc_memory=8000i,architecture="x86_64",cores=4i,cpu_load=891i,cpus=8i,free_memory=17972i,real_memory=31877i,slurmd_version="22.05.9",state="allocated",tres_billing=8,tres_cpu=8,tres_mem=31877,tres_used_cpu=8,tres_used_mem=8000,weight=1i 1723466497000000000
slurm_nodes,host=hoth,name=naboo219,source=slurm_primary.example.net alloc_cpu=16i,alloc_memory=16000i,architecture="x86_64",cores=4i,cpu_load=1382i,cpus=16i,free_memory=15645i,real_memory=31875i,slurmd_version="22.05.9",state="allocated",tres_billing=16,tres_cpu=16,tres_mem=31875,tres_used_cpu=16,tres_used_mem=16000,weight=1i 1723466497000000000
slurm_partitions,host=hoth,name=atlas,source=slurm_primary.example.net nodes="naboo145,naboo146,naboo147,naboo216,naboo219,naboo222,naboo224,naboo225,naboo227,naboo228,naboo229,naboo234,naboo235,naboo236,naboo237,naboo238,naboo239,naboo240,naboo241,naboo242,naboo243",state="UP",total_cpu=632i,total_nodes=21i,tres_billing=632,tres_cpu=632,tres_mem=1415207,tres_node=21 1723466497000000000
```

View file

@ -0,0 +1,46 @@
# Gather SLURM metrics
[[inputs.slurm]]
## Slurmrestd URL. Both http and https can be used as schemas.
url = "http://127.0.0.1:6820"
## Credentials for JWT-based authentication.
# username = "foo"
# token = "topSecret"
## Enabled endpoints
## List of endpoints a user can acquire data from.
## Available values are: diag, jobs, nodes, partitions, reservations.
# enabled_endpoints = ["diag", "jobs", "nodes", "partitions", "reservations"]
## Maximum time to receive a response. If set to 0s, the
## request will not time out.
# response_timeout = "5s"
## Optional TLS Config. Note these options will only
  ## be taken into account when the scheme specified on
## the URL parameter is https. They will be silently
## ignored otherwise.
## Set to true/false to enforce TLS being enabled/disabled. If not set,
## enable TLS only if any of the other options are specified.
# tls_enable =
## Trusted root certificates for server
# tls_ca = "/path/to/cafile"
## Used for TLS client certificate authentication
# tls_cert = "/path/to/certfile"
## Used for TLS client certificate authentication
# tls_key = "/path/to/keyfile"
## Password for the key file if it is encrypted
# tls_key_pwd = ""
## Send the specified TLS server name via SNI
# tls_server_name = "kubernetes.example.com"
## Minimal TLS version to accept by the client
# tls_min_version = "TLS12"
## List of ciphers to accept, by default all secure ciphers will be accepted
## See https://pkg.go.dev/crypto/tls#pkg-constants for supported values.
  ## Use "all", "secure" and "insecure" to add all supported ciphers, secure
## suites or insecure suites respectively.
# tls_cipher_suites = ["secure"]
## Renegotiation method, "never", "once" or "freely"
# tls_renegotiation_method = "never"
## Use TLS but skip chain & host verification
# insecure_skip_verify = false

View file

@ -0,0 +1,23 @@
# Gather SLURM metrics
[[inputs.slurm]]
## Slurmrestd URL. Both http and https can be used as schemas.
url = "http://127.0.0.1:6820"
## Credentials for JWT-based authentication.
# username = "foo"
# token = "topSecret"
## Enabled endpoints
## List of endpoints a user can acquire data from.
## Available values are: diag, jobs, nodes, partitions, reservations.
# enabled_endpoints = ["diag", "jobs", "nodes", "partitions", "reservations"]
## Maximum time to receive a response. If set to 0s, the
## request will not time out.
# response_timeout = "5s"
## Optional TLS Config. Note these options will only
  ## be taken into account when the scheme specified on
## the URL parameter is https. They will be silently
## ignored otherwise.
{{template "/plugins/common/tls/client.conf"}}

View file

@ -0,0 +1,476 @@
//go:generate ../../../tools/config_includer/generator
//go:generate ../../../tools/readme_config_includer/generator
package slurm
import (
"context"
_ "embed"
"errors"
"fmt"
"net/http"
"net/url"
"strconv"
"strings"
"time"
goslurm "github.com/pcolladosoto/goslurm/v0038"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/plugins/inputs"
)
//go:embed sample.conf
var sampleConfig string
type Slurm struct {
URL string `toml:"url"`
Username string `toml:"username"`
Token string `toml:"token"`
EnabledEndpoints []string `toml:"enabled_endpoints"`
ResponseTimeout config.Duration `toml:"response_timeout"`
Log telegraf.Logger `toml:"-"`
tls.ClientConfig
client *goslurm.APIClient
baseURL *url.URL
endpointMap map[string]bool
}
func (*Slurm) SampleConfig() string {
return sampleConfig
}
func (s *Slurm) Init() error {
if len(s.EnabledEndpoints) == 0 {
s.EnabledEndpoints = []string{"diag", "jobs", "nodes", "partitions", "reservations"}
}
s.endpointMap = make(map[string]bool, len(s.EnabledEndpoints))
for _, endpoint := range s.EnabledEndpoints {
switch e := strings.ToLower(endpoint); e {
case "diag", "jobs", "nodes", "partitions", "reservations":
s.endpointMap[e] = true
default:
return fmt.Errorf("unknown endpoint %q", endpoint)
}
}
if s.URL == "" {
return errors.New("empty URL provided")
}
u, err := url.Parse(s.URL)
if err != nil {
return err
}
if u.Hostname() == "" {
return fmt.Errorf("empty hostname for url %q", s.URL)
}
s.baseURL = u
if u.Scheme != "http" && u.Scheme != "https" {
return fmt.Errorf("invalid scheme %q", u.Scheme)
}
tlsCfg, err := s.ClientConfig.TLSConfig()
if err != nil {
return err
}
if u.Scheme == "http" && tlsCfg != nil {
s.Log.Warn("non-empty TLS configuration for a URL with an http scheme. Ignoring it...")
tlsCfg = nil
}
configuration := goslurm.NewConfiguration()
configuration.Host = u.Host
configuration.Scheme = u.Scheme
configuration.UserAgent = internal.ProductToken()
configuration.HTTPClient = &http.Client{
Transport: &http.Transport{
TLSClientConfig: tlsCfg,
},
Timeout: time.Duration(s.ResponseTimeout),
}
s.client = goslurm.NewAPIClient(configuration)
return nil
}
func (s *Slurm) Gather(acc telegraf.Accumulator) (err error) {
auth := context.WithValue(
context.Background(),
goslurm.ContextAPIKeys,
map[string]goslurm.APIKey{
"user": {Key: s.Username},
"token": {Key: s.Token},
},
)
if s.endpointMap["diag"] {
diagResp, respRaw, err := s.client.SlurmAPI.SlurmV0038Diag(auth).Execute()
if err != nil {
return fmt.Errorf("error getting diag: %w", err)
}
if diag, ok := diagResp.GetStatisticsOk(); ok {
s.gatherDiagMetrics(acc, diag)
}
respRaw.Body.Close()
}
if s.endpointMap["jobs"] {
jobsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetJobs(auth).Execute()
if err != nil {
return fmt.Errorf("error getting jobs: %w", err)
}
if jobs, ok := jobsResp.GetJobsOk(); ok {
s.gatherJobsMetrics(acc, jobs)
}
respRaw.Body.Close()
}
if s.endpointMap["nodes"] {
nodesResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetNodes(auth).Execute()
if err != nil {
return fmt.Errorf("error getting nodes: %w", err)
}
if nodes, ok := nodesResp.GetNodesOk(); ok {
s.gatherNodesMetrics(acc, nodes)
}
respRaw.Body.Close()
}
if s.endpointMap["partitions"] {
partitionsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetPartitions(auth).Execute()
if err != nil {
return fmt.Errorf("error getting partitions: %w", err)
}
if partitions, ok := partitionsResp.GetPartitionsOk(); ok {
s.gatherPartitionsMetrics(acc, partitions)
}
respRaw.Body.Close()
}
if s.endpointMap["reservations"] {
reservationsResp, respRaw, err := s.client.SlurmAPI.SlurmV0038GetReservations(auth).Execute()
if err != nil {
return fmt.Errorf("error getting reservations: %w", err)
}
if reservations, ok := reservationsResp.GetReservationsOk(); ok {
s.gatherReservationsMetrics(acc, reservations)
}
respRaw.Body.Close()
}
return nil
}
func parseTres(tres string) map[string]interface{} {
tresKVs := strings.Split(tres, ",")
parsedValues := make(map[string]interface{}, len(tresKVs))
for _, tresVal := range tresKVs {
parsedTresVal := strings.Split(tresVal, "=")
if len(parsedTresVal) != 2 {
continue
}
tag := parsedTresVal[0]
val := parsedTresVal[1]
var factor float64 = 1
if tag == "mem" {
var ok bool
factor, ok = map[string]float64{
"K": 1.0 / 1024.0,
"M": 1,
"G": 1024,
"T": 1024 * 1024,
"P": 1024 * 1024 * 1024,
}[strings.ToUpper(val[len(val)-1:])]
if !ok {
continue
}
val = val[:len(val)-1]
}
parsedFloat, err := strconv.ParseFloat(val, 64)
if err == nil {
parsedValues[tag] = parsedFloat * factor
continue
}
parsedValues[tag] = val
}
return parsedValues
}
func (s *Slurm) gatherDiagMetrics(acc telegraf.Accumulator, diag *goslurm.V0038DiagStatistics) {
records := make(map[string]interface{}, 13)
tags := map[string]string{"source": s.baseURL.Hostname()}
if int32Ptr, ok := diag.GetServerThreadCountOk(); ok {
records["server_thread_count"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsCanceledOk(); ok {
records["jobs_canceled"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsSubmittedOk(); ok {
records["jobs_submitted"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsStartedOk(); ok {
records["jobs_started"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsCompletedOk(); ok {
records["jobs_completed"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsFailedOk(); ok {
records["jobs_failed"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsPendingOk(); ok {
records["jobs_pending"] = *int32Ptr
}
if int32Ptr, ok := diag.GetJobsRunningOk(); ok {
records["jobs_running"] = *int32Ptr
}
if int32Ptr, ok := diag.GetScheduleCycleLastOk(); ok {
records["schedule_cycle_last"] = *int32Ptr
}
if int32Ptr, ok := diag.GetScheduleCycleMeanOk(); ok {
records["schedule_cycle_mean"] = *int32Ptr
}
if int32Ptr, ok := diag.GetBfQueueLenOk(); ok {
records["bf_queue_len"] = *int32Ptr
}
if int32Ptr, ok := diag.GetBfQueueLenMeanOk(); ok {
records["bf_queue_len_mean"] = *int32Ptr
}
if boolPtr, ok := diag.GetBfActiveOk(); ok {
records["bf_active"] = *boolPtr
}
acc.AddFields("slurm_diag", records, tags)
}
func (s *Slurm) gatherJobsMetrics(acc telegraf.Accumulator, jobs []goslurm.V0038JobResponseProperties) {
for i := range jobs {
records := make(map[string]interface{}, 19)
tags := make(map[string]string, 3)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := jobs[i].GetNameOk(); ok {
tags["name"] = *strPtr
}
if int32Ptr, ok := jobs[i].GetJobIdOk(); ok {
tags["job_id"] = strconv.Itoa(int(*int32Ptr))
}
if strPtr, ok := jobs[i].GetJobStateOk(); ok {
records["state"] = *strPtr
}
if strPtr, ok := jobs[i].GetStateReasonOk(); ok {
records["state_reason"] = *strPtr
}
if strPtr, ok := jobs[i].GetPartitionOk(); ok {
records["partition"] = *strPtr
}
if strPtr, ok := jobs[i].GetNodesOk(); ok {
records["nodes"] = *strPtr
}
if int32Ptr, ok := jobs[i].GetNodeCountOk(); ok {
records["node_count"] = *int32Ptr
}
if int64Ptr, ok := jobs[i].GetPriorityOk(); ok {
records["priority"] = *int64Ptr
}
if int32Ptr, ok := jobs[i].GetNiceOk(); ok {
records["nice"] = *int32Ptr
}
if int32Ptr, ok := jobs[i].GetGroupIdOk(); ok {
records["group_id"] = *int32Ptr
}
if strPtr, ok := jobs[i].GetCommandOk(); ok {
records["command"] = *strPtr
}
if strPtr, ok := jobs[i].GetStandardOutputOk(); ok {
records["standard_output"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if strPtr, ok := jobs[i].GetStandardErrorOk(); ok {
records["standard_error"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if strPtr, ok := jobs[i].GetStandardInputOk(); ok {
records["standard_input"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if strPtr, ok := jobs[i].GetCurrentWorkingDirectoryOk(); ok {
records["current_working_directory"] = strings.ReplaceAll(*strPtr, "\\", "")
}
if int64Ptr, ok := jobs[i].GetSubmitTimeOk(); ok {
records["submit_time"] = *int64Ptr
}
if int64Ptr, ok := jobs[i].GetStartTimeOk(); ok {
records["start_time"] = *int64Ptr
}
if int32Ptr, ok := jobs[i].GetCpusOk(); ok {
records["cpus"] = *int32Ptr
}
if int32Ptr, ok := jobs[i].GetTasksOk(); ok {
records["tasks"] = *int32Ptr
}
if int64Ptr, ok := jobs[i].GetTimeLimitOk(); ok {
records["time_limit"] = *int64Ptr
}
if strPtr, ok := jobs[i].GetTresReqStrOk(); ok {
for k, v := range parseTres(*strPtr) {
records["tres_"+k] = v
}
}
acc.AddFields("slurm_jobs", records, tags)
}
}
func (s *Slurm) gatherNodesMetrics(acc telegraf.Accumulator, nodes []goslurm.V0038Node) {
for _, node := range nodes {
records := make(map[string]interface{}, 13)
tags := make(map[string]string, 2)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := node.GetNameOk(); ok {
tags["name"] = *strPtr
}
if strPtr, ok := node.GetStateOk(); ok {
records["state"] = *strPtr
}
if int32Ptr, ok := node.GetCoresOk(); ok {
records["cores"] = *int32Ptr
}
if int32Ptr, ok := node.GetCpusOk(); ok {
records["cpus"] = *int32Ptr
}
if int64Ptr, ok := node.GetCpuLoadOk(); ok {
records["cpu_load"] = *int64Ptr
}
if int64Ptr, ok := node.GetAllocCpusOk(); ok {
records["alloc_cpu"] = *int64Ptr
}
if int32Ptr, ok := node.GetRealMemoryOk(); ok {
records["real_memory"] = *int32Ptr
}
if int32Ptr, ok := node.GetFreeMemoryOk(); ok {
records["free_memory"] = *int32Ptr
}
if int64Ptr, ok := node.GetAllocMemoryOk(); ok {
records["alloc_memory"] = *int64Ptr
}
if strPtr, ok := node.GetTresOk(); ok {
for k, v := range parseTres(*strPtr) {
records["tres_"+k] = v
}
}
if strPtr, ok := node.GetTresUsedOk(); ok {
for k, v := range parseTres(*strPtr) {
records["tres_used_"+k] = v
}
}
if int32Ptr, ok := node.GetWeightOk(); ok {
records["weight"] = *int32Ptr
}
if strPtr, ok := node.GetSlurmdVersionOk(); ok {
records["slurmd_version"] = *strPtr
}
if strPtr, ok := node.GetArchitectureOk(); ok {
records["architecture"] = *strPtr
}
acc.AddFields("slurm_nodes", records, tags)
}
}
func (s *Slurm) gatherPartitionsMetrics(acc telegraf.Accumulator, partitions []goslurm.V0038Partition) {
for _, partition := range partitions {
records := make(map[string]interface{}, 5)
tags := make(map[string]string, 2)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := partition.GetNameOk(); ok {
tags["name"] = *strPtr
}
if strPtr, ok := partition.GetStateOk(); ok {
records["state"] = *strPtr
}
if int32Ptr, ok := partition.GetTotalCpusOk(); ok {
records["total_cpu"] = *int32Ptr
}
if int32Ptr, ok := partition.GetTotalNodesOk(); ok {
records["total_nodes"] = *int32Ptr
}
if strPtr, ok := partition.GetNodesOk(); ok {
records["nodes"] = *strPtr
}
if strPtr, ok := partition.GetTresOk(); ok {
for k, v := range parseTres(*strPtr) {
records["tres_"+k] = v
}
}
acc.AddFields("slurm_partitions", records, tags)
}
}
func (s *Slurm) gatherReservationsMetrics(acc telegraf.Accumulator, reservations []goslurm.V0038Reservation) {
for _, reservation := range reservations {
records := make(map[string]interface{}, 9)
tags := make(map[string]string, 2)
tags["source"] = s.baseURL.Hostname()
if strPtr, ok := reservation.GetNameOk(); ok {
tags["name"] = *strPtr
}
if int32Ptr, ok := reservation.GetCoreCountOk(); ok {
records["core_count"] = *int32Ptr
}
if int32Ptr, ok := reservation.GetCoreSpecCntOk(); ok {
records["core_spec_count"] = *int32Ptr
}
if strPtr, ok := reservation.GetGroupsOk(); ok {
records["groups"] = *strPtr
}
if strPtr, ok := reservation.GetUsersOk(); ok {
records["users"] = *strPtr
}
if int32Ptr, ok := reservation.GetStartTimeOk(); ok {
records["start_time"] = *int32Ptr
}
if strPtr, ok := reservation.GetPartitionOk(); ok {
records["partition"] = *strPtr
}
if strPtr, ok := reservation.GetAccountsOk(); ok {
records["accounts"] = *strPtr
}
if int32Ptr, ok := reservation.GetNodeCountOk(); ok {
records["node_count"] = *int32Ptr
}
if strPtr, ok := reservation.GetNodeListOk(); ok {
records["node_list"] = *strPtr
}
acc.AddFields("slurm_reservations", records, tags)
}
}
func init() {
inputs.Add("slurm", func() telegraf.Input {
return &Slurm{
ResponseTimeout: config.Duration(5 * time.Second),
}
})
}

View file

@ -0,0 +1,161 @@
package slurm
import (
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/plugins/parsers/influx"
"github.com/influxdata/telegraf/testutil"
)
func TestGoodURLs(t *testing.T) {
tests := []struct {
name string
url string
}{
{"http", "http://example.com:6820"},
{"https", "https://example.com:6820"},
{"http no port", "http://example.com"},
{"https no port", "https://example.com"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := Slurm{
URL: tt.url,
}
require.NoError(t, plugin.Init())
})
}
}
func TestWrongURLs(t *testing.T) {
tests := []struct {
name string
url string
}{
{"wrong http scheme", "httpp://example.com:6820"},
{"wrong https scheme", "httpss://example.com:6820"},
{"empty url", ""},
{"empty hostname", "http://:6820"},
{"only scheme", "http://"},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := Slurm{
URL: tt.url,
}
require.Error(t, plugin.Init())
})
}
}
func TestWrongEndpoints(t *testing.T) {
tests := []struct {
name string
enabledEndpoints []string
}{
{"empty endpoint", []string{"diag", "", "jobs"}},
{"mistyped endpoint", []string{"diagg", "jobs", "partitions"}},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
plugin := Slurm{
URL: "http://example.net",
EnabledEndpoints: tt.enabledEndpoints,
}
require.Error(t, plugin.Init())
})
}
}
// TestCases runs one sub-test per directory under "testcases". Each directory
// bundles canned slurmrestd JSON responses (responses/), the plugin
// configuration (telegraf.conf) and, optionally, the expected metrics in
// influx line protocol (expected.out).
func TestCases(t *testing.T) {
	entries, err := os.ReadDir("testcases")
	require.NoError(t, err)
	for _, entry := range entries {
		if !entry.IsDir() {
			continue
		}
		t.Run(entry.Name(), func(t *testing.T) {
			testcasePath := filepath.Join("testcases", entry.Name())
			responsesPath := filepath.Join(testcasePath, "responses")
			expectedFilename := filepath.Join(testcasePath, "expected.out")
			configFilename := filepath.Join(testcasePath, "telegraf.conf")
			// Map each endpoint (response filename without its extension) to
			// the raw JSON payload the mock server should serve for it.
			responses, err := os.ReadDir(responsesPath)
			require.NoError(t, err)
			pathToResponse := map[string][]byte{}
			for _, response := range responses {
				if response.IsDir() {
					continue
				}
				fName := response.Name()
				buf, err := os.ReadFile(filepath.Join(responsesPath, fName))
				require.NoError(t, err)
				pathToResponse[strings.TrimSuffix(fName, filepath.Ext(fName))] = buf
			}
			// Prepare the influx parser for expectations
			parser := &influx.Parser{}
			require.NoError(t, parser.Init())
			// Read expected values, if any
			var expected []telegraf.Metric
			if _, err := os.Stat(expectedFilename); err == nil {
				var err error
				expected, err = testutil.ParseMetricsFromFile(expectedFilename, parser)
				require.NoError(t, err)
			}
			ts := httptest.NewServer(http.NotFoundHandler())
			defer ts.Close()
			ts.Config.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
				resp, ok := pathToResponse[strings.TrimPrefix(r.URL.Path, "/slurm/v0.0.38/")]
				if !ok {
					w.WriteHeader(http.StatusInternalServerError)
					t.Errorf("Expected to have path to response: %s", r.URL.Path)
					return
				}
				w.Header().Add("Content-Type", "application/json")
				// Write implicitly sends a 200 status on its first call, so
				// calling WriteHeader afterwards (with any code) would be a
				// no-op that only triggers a "superfluous response.WriteHeader"
				// warning; just record a write failure through the test.
				if _, err := w.Write(resp); err != nil {
					t.Error(err)
				}
			})
			// Load the test-specific configuration
			cfg := config.NewConfig()
			cfg.Agent.Quiet = true
			require.NoError(t, cfg.LoadConfig(configFilename))
			require.Len(t, cfg.Inputs, 1)
			// Instantiate the plugin. As seen on NewConfig's documentation,
			// parsing the configuration will instantiate the plugins, so that
			// we only need to assert the plugin's type!
			plugin := cfg.Inputs[0].Input.(*Slurm)
			plugin.URL = "http://" + ts.Listener.Addr().String()
			plugin.Log = testutil.Logger{}
			require.NoError(t, plugin.Init())
			var acc testutil.Accumulator
			require.NoError(t, plugin.Gather(&acc))
			actual := acc.GetTelegrafMetrics()
			testutil.RequireMetricsEqual(t, expected, actual, testutil.SortMetrics(), testutil.IgnoreTime())
		})
	}
}

View file

@ -0,0 +1,11 @@
slurm_diag,source=127.0.0.1 bf_active=false,bf_queue_len=1i,bf_queue_len_mean=1i,jobs_canceled=0i,jobs_completed=287i,jobs_failed=1i,jobs_pending=0i,jobs_running=100i,jobs_started=287i,jobs_submitted=287i,schedule_cycle_last=298i,schedule_cycle_mean=137i,server_thread_count=3i 1723464650000000000
slurm_jobs,job_id=20464,name=gridjob,source=127.0.0.1 command="/tmp/SLURM_job_script.OjQEIH",cpus=2i,current_working_directory="/home/sessiondir/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo222",partition="atlas",priority=4294881265i,standard_error="/home/sessiondir/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",start_time=1722989851i,state="RUNNING",state_reason="None",submit_time=1722989851i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=2000,tres_node=1 1723464650000000000
slurm_jobs,job_id=20468,name=gridjob,source=127.0.0.1 command="/tmp/SLURM_job_script.XTwtdj",cpus=2i,current_working_directory="/home/sessiondir/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n",group_id=2005i,nice=50i,node_count=1i,nodes="naboo222",partition="atlas",priority=4294881261i,standard_error="/home/sessiondir/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",standard_input="/dev/null",standard_output="/home/sessiondir/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",start_time=1722990772i,state="RUNNING",state_reason="None",submit_time=1722990772i,tasks=1i,time_limit=3600i,tres_billing=1,tres_cpu=1,tres_mem=2000,tres_node=1 1723464650000000000
slurm_jobs,job_id=23772,name=gridjob,source=127.0.0.1 command="/tmp/SLURM_job_script.8PMmVe",cpus=8i,current_working_directory="/home/sessiondir/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm",group_id=2005i,nice=50i,node_count=1i,nodes="naboo147",partition="atlas",priority=4294877957i,standard_error="/home/sessiondir/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",standard_input="/dev/null",standard_output="/home/sessiondir/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",start_time=1723457333i,state="COMPLETED",state_reason="None",submit_time=1723457333i,tasks=8i,time_limit=3600i,tres_billing=8,tres_cpu=8,tres_mem=16000,tres_node=1 1723464650000000000
slurm_nodes,name=naboo145,source=127.0.0.1 alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=27i,cpus=36i,free_memory=86423i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723464650000000000
slurm_nodes,name=naboo146,source=127.0.0.1 alloc_cpu=0i,alloc_memory=0i,architecture="x86_64",cores=18i,cpu_load=0i,cpus=36i,free_memory=92151i,real_memory=94791i,slurmd_version="22.05.9",state="idle",tres_billing=36,tres_cpu=36,tres_mem=94791,weight=1i 1723464650000000000
slurm_nodes,name=naboo147,source=127.0.0.1 alloc_cpu=36i,alloc_memory=56000i,architecture="x86_64",cores=18i,cpu_load=2969i,cpus=36i,free_memory=10908i,real_memory=94793i,slurmd_version="22.05.9",state="allocated",tres_billing=36,tres_cpu=36,tres_mem=94793,tres_used_cpu=36,tres_used_mem=56000,weight=1i 1723464650000000000
slurm_partitions,name=atlas,source=127.0.0.1 nodes="naboo145,naboo146,naboo147,naboo216,naboo219,naboo222,naboo224,naboo225,naboo227,naboo228,naboo229,naboo234,naboo235,naboo236,naboo237,naboo238,naboo239,naboo240,naboo241,naboo242,naboo243",state="UP",total_cpu=632i,total_nodes=21i,tres_billing=632,tres_cpu=632,tres_mem=1415207,tres_node=21 1723464650000000000

View file

@ -0,0 +1,224 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"statistics": {
"rpcs_by_message_type": [
{
"message_type": "REQUEST_JOB_INFO",
"type_id": 2003,
"count": 73587,
"average_time": 658,
"total_time": 48479000
},
{
"message_type": "REQUEST_PARTITION_INFO",
"type_id": 2009,
"count": 158967,
"average_time": 101,
"total_time": 16185440
},
{
"message_type": "MESSAGE_NODE_REGISTRATION_STATUS",
"type_id": 1002,
"count": 18690,
"average_time": 137,
"total_time": 2566758
},
{
"message_type": "REQUEST_COMPLETE_BATCH_SCRIPT",
"type_id": 5018,
"count": 12233,
"average_time": 486,
"total_time": 5946490
},
{
"message_type": "REQUEST_AUTH_TOKEN",
"type_id": 5039,
"count": 36,
"average_time": 291,
"total_time": 10489
},
{
"message_type": "REQUEST_BUILD_INFO",
"type_id": 2001,
"count": 28201,
"average_time": 194,
"total_time": 5486061
},
{
"message_type": "REQUEST_PING",
"type_id": 1008,
"count": 28201,
"average_time": 103,
"total_time": 2925195
},
{
"message_type": "REQUEST_NODE_INFO",
"type_id": 2007,
"count": 85379,
"average_time": 175,
"total_time": 15007960
},
{
"message_type": "REQUEST_FED_INFO",
"type_id": 2049,
"count": 24466,
"average_time": 109,
"total_time": 2681655
},
{
"message_type": "REQUEST_JOB_INFO_SINGLE",
"type_id": 2021,
"count": 24466,
"average_time": 121,
"total_time": 2963320
},
{
"message_type": "REQUEST_SUBMIT_BATCH_JOB",
"type_id": 4003,
"count": 12233,
"average_time": 6504,
"total_time": 79574600
},
{
"message_type": "REQUEST_STATS_INFO",
"type_id": 2035,
"count": 1040,
"average_time": 61,
"total_time": 64431
},
{
"message_type": "MESSAGE_EPILOG_COMPLETE",
"type_id": 6012,
"count": 40,
"average_time": 86,
"total_time": 3455
},
{
"message_type": "REQUEST_RESERVATION_INFO",
"type_id": 2024,
"count": 1017,
"average_time": 47,
"total_time": 48788
},
{
"message_type": "REQUEST_LICENSE_INFO",
"type_id": 1021,
"count": 42,
"average_time": 43,
"total_time": 1823
},
{
"message_type": "REQUEST_UPDATE_NODE",
"type_id": 3002,
"count": 2,
"average_time": 415,
"total_time": 830
}
],
"rpcs_by_user": [
{
"user": "root",
"user_id": 0,
"count": 456365,
"average_time": 224,
"total_time": 102371523
},
{
"user": "atl001",
"user_id": 2006,
"count": 11699,
"average_time": 6611,
"total_time": 77353396
},
{
"user": "atl002",
"user_id": 2007,
"count": 120,
"average_time": 3684,
"total_time": 442106
},
{
"user": "ops001",
"user_id": 18006,
"count": 298,
"average_time": 4447,
"total_time": 1325496
},
{
"user": "ops003",
"user_id": 18008,
"count": 58,
"average_time": 3732,
"total_time": 216488
},
{
"user": "ops002",
"user_id": 18007,
"count": 58,
"average_time": 4088,
"total_time": 237114
},
{
"user": "99",
"user_id": 99,
"count": 2,
"average_time": 86,
"total_time": 172
}
],
"parts_packed": 1,
"req_time": 1723103198,
"req_time_start": 1723075200,
"server_thread_count": 3,
"agent_queue_size": 0,
"agent_count": 0,
"agent_thread_count": 0,
"dbd_agent_queue_size": 0,
"gettimeofday_latency": 21,
"schedule_cycle_max": 1116,
"schedule_cycle_last": 298,
"schedule_cycle_total": 960,
"schedule_cycle_mean": 137,
"schedule_cycle_mean_depth": 0,
"schedule_cycle_per_minute": 2,
"schedule_queue_length": 1,
"jobs_submitted": 287,
"jobs_started": 287,
"jobs_completed": 287,
"jobs_canceled": 0,
"jobs_failed": 1,
"jobs_pending": 0,
"jobs_running": 100,
"job_states_ts": 1723103172,
"bf_backfilled_jobs": 1626,
"bf_last_backfilled_jobs": 14,
"bf_backfilled_het_jobs": 0,
"bf_cycle_counter": 12,
"bf_cycle_mean": 440,
"bf_depth_mean": 1,
"bf_depth_mean_try": 1,
"bf_cycle_last": 387,
"bf_cycle_max": 811,
"bf_queue_len": 1,
"bf_queue_len_mean": 1,
"bf_table_size": 1,
"bf_table_size_mean": 1,
"bf_when_last_cycle": 1723102514,
"bf_active": false
}
}

View file

@ -0,0 +1,448 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"jobs": [
{
"account": "",
"accrue_time": 1722989851,
"admin_comment": "",
"array_job_id": 0,
"array_task_id": null,
"array_max_tasks": 0,
"array_task_string": "",
"association_id": 0,
"batch_features": "",
"batch_flag": true,
"batch_host": "naboo222",
"flags": [
"JOB_WAS_RUNNING",
"JOB_MEM_SET"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "local",
"cluster_features": "",
"command": "\/tmp\/SLURM_job_script.OjQEIH",
"comment": "",
"container": "",
"contiguous": false,
"core_spec": null,
"thread_spec": null,
"cores_per_socket": null,
"billable_tres": 2.0,
"cpus_per_task": null,
"cpu_frequency_minimum": null,
"cpu_frequency_maximum": null,
"cpu_frequency_governor": null,
"cpus_per_tres": "",
"deadline": 0,
"delay_boot": 0,
"dependency": "",
"derived_exit_code": 0,
"eligible_time": 1722989851,
"end_time": 1723205851,
"excluded_nodes": "",
"exit_code": 0,
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 2005,
"group_name": "atlas",
"job_id": 20464,
"job_resources": {
"nodes": "naboo222",
"allocated_hosts": 1,
"allocated_nodes": [
{
"sockets": {
"0": {
"cores": {
"0": "allocated"
}
}
},
"nodename": "naboo222",
"cpus_used": 0,
"memory_used": 0,
"memory_allocated": 4000
}
]
},
"job_state": "RUNNING",
"last_sched_evaluation": 1722989851,
"licenses": "",
"max_cpus": 0,
"max_nodes": 0,
"mcs_label": "",
"memory_per_tres": "",
"name": "gridjob",
"nodes": "naboo222",
"nice": 50,
"tasks_per_core": null,
"tasks_per_node": 0,
"tasks_per_socket": null,
"tasks_per_board": 0,
"cpus": 2,
"node_count": 1,
"tasks": 1,
"het_job_id": 0,
"het_job_id_set": "",
"het_job_offset": 0,
"partition": "atlas",
"prefer": "",
"memory_per_node": null,
"memory_per_cpu": 2000,
"minimum_cpus_per_node": 1,
"minimum_tmp_disk_per_node": 0,
"preempt_time": 0,
"pre_sus_time": 0,
"priority": 4294881265,
"profile": null,
"qos": "",
"reboot": false,
"required_nodes": "",
"requeue": false,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"shared": null,
"show_flags": [
"SHOW_ALL",
"SHOW_DETAIL",
"SHOW_LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": null,
"start_time": 1722989851,
"state_description": "",
"state_reason": "None",
"standard_error": "\/home\/sessiondir\/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",
"standard_input": "\/dev\/null",
"standard_output": "\/home\/sessiondir\/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm.comment",
"submit_time": 1722989851,
"suspend_time": 0,
"system_comment": "",
"time_limit": 3600,
"time_minimum": 0,
"threads_per_core": null,
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=1,mem=2000M,node=1,billing=1",
"tres_alloc_str": "cpu=2,mem=4000M,node=1,billing=2",
"user_id": 2006,
"user_name": "atl001",
"wckey": "",
"current_working_directory": "\/home\/sessiondir\/zv6NDmqNcv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmXSJKDmFRYcQm"
},
{
"account": "",
"accrue_time": 1722990772,
"admin_comment": "",
"array_job_id": 0,
"array_task_id": null,
"array_max_tasks": 0,
"array_task_string": "",
"association_id": 0,
"batch_features": "",
"batch_flag": true,
"batch_host": "naboo222",
"flags": [
"JOB_WAS_RUNNING",
"JOB_MEM_SET"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "local",
"cluster_features": "",
"command": "\/tmp\/SLURM_job_script.XTwtdj",
"comment": "",
"container": "",
"contiguous": false,
"core_spec": null,
"thread_spec": null,
"cores_per_socket": null,
"billable_tres": 2.0,
"cpus_per_task": null,
"cpu_frequency_minimum": null,
"cpu_frequency_maximum": null,
"cpu_frequency_governor": null,
"cpus_per_tres": "",
"deadline": 0,
"delay_boot": 0,
"dependency": "",
"derived_exit_code": 0,
"eligible_time": 1722990772,
"end_time": 1723206772,
"excluded_nodes": "",
"exit_code": 0,
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 2005,
"group_name": "atlas",
"job_id": 20468,
"job_resources": {
"nodes": "naboo222",
"allocated_hosts": 1,
"allocated_nodes": [
{
"sockets": {
"1": {
"cores": {
"2": "allocated"
}
}
},
"nodename": "naboo222",
"cpus_used": 0,
"memory_used": 0,
"memory_allocated": 4000
}
]
},
"job_state": "RUNNING",
"last_sched_evaluation": 1722990772,
"licenses": "",
"max_cpus": 0,
"max_nodes": 0,
"mcs_label": "",
"memory_per_tres": "",
"name": "gridjob",
"nodes": "naboo222",
"nice": 50,
"tasks_per_core": null,
"tasks_per_node": 0,
"tasks_per_socket": null,
"tasks_per_board": 0,
"cpus": 2,
"node_count": 1,
"tasks": 1,
"het_job_id": 0,
"het_job_id_set": "",
"het_job_offset": 0,
"partition": "atlas",
"prefer": "",
"memory_per_node": null,
"memory_per_cpu": 2000,
"minimum_cpus_per_node": 1,
"minimum_tmp_disk_per_node": 0,
"preempt_time": 0,
"pre_sus_time": 0,
"priority": 4294881261,
"profile": null,
"qos": "",
"reboot": false,
"required_nodes": "",
"requeue": false,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"shared": null,
"show_flags": [
"SHOW_ALL",
"SHOW_DETAIL",
"SHOW_LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": null,
"start_time": 1722990772,
"state_description": "",
"state_reason": "None",
"standard_error": "\/home\/sessiondir\/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",
"standard_input": "\/dev\/null",
"standard_output": "\/home\/sessiondir\/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n.comment",
"submit_time": 1722990772,
"suspend_time": 0,
"system_comment": "",
"time_limit": 3600,
"time_minimum": 0,
"threads_per_core": null,
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=1,mem=2000M,node=1,billing=1",
"tres_alloc_str": "cpu=2,mem=4000M,node=1,billing=2",
"user_id": 2006,
"user_name": "atl001",
"wckey": "",
"current_working_directory": "\/home\/sessiondir\/ljvLDmQccv5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmcSJKDmor4c2n"
},
{
"account": "",
"accrue_time": 1723457333,
"admin_comment": "",
"array_job_id": 0,
"array_task_id": null,
"array_max_tasks": 0,
"array_task_string": "",
"association_id": 0,
"batch_features": "",
"batch_flag": true,
"batch_host": "naboo147",
"flags": [
"TRES_STR_CALC",
"JOB_MEM_SET"
],
"burst_buffer": "",
"burst_buffer_state": "",
"cluster": "local",
"cluster_features": "",
"command": "\/tmp\/SLURM_job_script.8PMmVe",
"comment": "",
"container": "",
"contiguous": false,
"core_spec": null,
"thread_spec": null,
"cores_per_socket": null,
"billable_tres": 8.0,
"cpus_per_task": null,
"cpu_frequency_minimum": null,
"cpu_frequency_maximum": null,
"cpu_frequency_governor": null,
"cpus_per_tres": "",
"deadline": 0,
"delay_boot": 0,
"dependency": "",
"derived_exit_code": 0,
"eligible_time": 1723457333,
"end_time": 1723463525,
"excluded_nodes": "",
"exit_code": 0,
"features": "",
"federation_origin": "",
"federation_siblings_active": "",
"federation_siblings_viable": "",
"gres_detail": [
],
"group_id": 2005,
"group_name": "atlas",
"job_id": 23772,
"job_resources": {
"nodes": "naboo147",
"allocated_hosts": 1,
"allocated_nodes": [
{
"sockets": {
"0": {
"cores": {
"3": "allocated",
"10": "allocated",
"12": "allocated",
"13": "allocated"
}
},
"1": {
"cores": {
"8": "allocated",
"11": "allocated",
"12": "allocated",
"13": "allocated"
}
}
},
"nodename": "naboo147",
"cpus_used": 0,
"memory_used": 0,
"memory_allocated": 16000
}
]
},
"job_state": "COMPLETED",
"last_sched_evaluation": 1723457333,
"licenses": "",
"max_cpus": 0,
"max_nodes": 0,
"mcs_label": "",
"memory_per_tres": "",
"name": "gridjob",
"nodes": "naboo147",
"nice": 50,
"tasks_per_core": null,
"tasks_per_node": 8,
"tasks_per_socket": null,
"tasks_per_board": 0,
"cpus": 8,
"node_count": 1,
"tasks": 8,
"het_job_id": 0,
"het_job_id_set": "",
"het_job_offset": 0,
"partition": "atlas",
"prefer": "",
"memory_per_node": null,
"memory_per_cpu": 2000,
"minimum_cpus_per_node": 8,
"minimum_tmp_disk_per_node": 0,
"preempt_time": 0,
"pre_sus_time": 0,
"priority": 4294877957,
"profile": null,
"qos": "",
"reboot": false,
"required_nodes": "",
"requeue": false,
"resize_time": 0,
"restart_cnt": 0,
"resv_name": "",
"shared": null,
"show_flags": [
"SHOW_ALL",
"SHOW_DETAIL",
"SHOW_LOCAL"
],
"sockets_per_board": 0,
"sockets_per_node": null,
"start_time": 1723457333,
"state_description": "",
"state_reason": "None",
"standard_error": "\/home\/sessiondir\/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",
"standard_input": "\/dev\/null",
"standard_output": "\/home\/sessiondir\/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm.comment",
"submit_time": 1723457333,
"suspend_time": 0,
"system_comment": "",
"time_limit": 3600,
"time_minimum": 0,
"threads_per_core": null,
"tres_bind": "",
"tres_freq": "",
"tres_per_job": "",
"tres_per_node": "",
"tres_per_socket": "",
"tres_per_task": "",
"tres_req_str": "cpu=8,mem=16000M,node=1,billing=8",
"tres_alloc_str": "cpu=8,mem=16000M,node=1,billing=8",
"user_id": 2006,
"user_name": "atl001",
"wckey": "",
"current_working_directory": "\/home\/sessiondir\/nN8KDmNMPx5nKG01gq4B3BRpm7wtQmABFKDmbnHPDmeIKKDml0xJjm"
}
]
}

View file

@ -0,0 +1,175 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"nodes": [
{
"architecture": "x86_64",
"burstbuffer_network_address": "",
"boards": 1,
"boot_time": 1719400973,
"comment": "",
"cores": 18,
"cpu_binding": 0,
"cpu_load": 27,
"extra": "",
"free_memory": 86423,
"cpus": 36,
"last_busy": 1723102876,
"features": "",
"active_features": "",
"gres": "",
"gres_drained": "N\/A",
"gres_used": "",
"mcs_label": "",
"name": "naboo145",
"next_state_after_reboot": "invalid",
"address": "naboo145",
"hostname": "naboo145",
"state": "idle",
"state_flags": [
"DRAIN"
],
"next_state_after_reboot_flags": [
],
"operating_system": "Linux 5.14.0-427.13.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Apr 30 18:22:29 EDT 2024",
"owner": null,
"partitions": [
"atlas"
],
"port": 6818,
"real_memory": 94791,
"reason": "Kill task failed",
"reason_changed_at": 1723077306,
"reason_set_by_user": "root",
"slurmd_start_time": 1720394759,
"sockets": 2,
"threads": 1,
"temporary_disk": 0,
"weight": 1,
"tres": "cpu=36,mem=94791M,billing=36",
"slurmd_version": "22.05.9",
"alloc_memory": 0,
"alloc_cpus": 0,
"idle_cpus": 36,
"tres_used": null,
"tres_weighted": 0.0
},
{
"architecture": "x86_64",
"burstbuffer_network_address": "",
"boards": 1,
"boot_time": 1719400759,
"comment": "",
"cores": 18,
"cpu_binding": 0,
"cpu_load": 0,
"extra": "",
"free_memory": 92151,
"cpus": 36,
"last_busy": 1722780995,
"features": "",
"active_features": "",
"gres": "",
"gres_drained": "N\/A",
"gres_used": "",
"mcs_label": "",
"name": "naboo146",
"next_state_after_reboot": "invalid",
"address": "naboo146",
"hostname": "naboo146",
"state": "idle",
"state_flags": [
"DRAIN"
],
"next_state_after_reboot_flags": [
],
"operating_system": "Linux 5.14.0-427.13.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Apr 30 18:22:29 EDT 2024",
"owner": null,
"partitions": [
"atlas"
],
"port": 6818,
"real_memory": 94791,
"reason": "Kill task failed",
"reason_changed_at": 1722748927,
"reason_set_by_user": "root",
"slurmd_start_time": 1720394759,
"sockets": 2,
"threads": 1,
"temporary_disk": 0,
"weight": 1,
"tres": "cpu=36,mem=94791M,billing=36",
"slurmd_version": "22.05.9",
"alloc_memory": 0,
"alloc_cpus": 0,
"idle_cpus": 36,
"tres_used": null,
"tres_weighted": 0.0
},
{
"architecture": "x86_64",
"burstbuffer_network_address": "",
"boards": 1,
"boot_time": 1719406605,
"comment": "",
"cores": 18,
"cpu_binding": 0,
"cpu_load": 2969,
"extra": "",
"free_memory": 10908,
"cpus": 36,
"last_busy": 1722881704,
"features": "",
"active_features": "",
"gres": "",
"gres_drained": "N\/A",
"gres_used": "",
"mcs_label": "",
"name": "naboo147",
"next_state_after_reboot": "invalid",
"address": "naboo147",
"hostname": "naboo147",
"state": "allocated",
"state_flags": [
],
"next_state_after_reboot_flags": [
],
"operating_system": "Linux 5.14.0-427.13.1.el9_4.x86_64 #1 SMP PREEMPT_DYNAMIC Tue Apr 30 18:22:29 EDT 2024",
"owner": null,
"partitions": [
"atlas"
],
"port": 6818,
"real_memory": 94793,
"reason": "",
"reason_changed_at": 0,
"reason_set_by_user": null,
"slurmd_start_time": 1720394759,
"sockets": 2,
"threads": 1,
"temporary_disk": 0,
"weight": 1,
"tres": "cpu=36,mem=94793M,billing=36",
"slurmd_version": "22.05.9",
"alloc_memory": 56000,
"alloc_cpus": 36,
"idle_cpus": 0,
"tres_used": "cpu=36,mem=56000M",
"tres_weighted": 36.0
}
]
}

View file

@ -0,0 +1,56 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"partitions": [
{
"flags": [
"default"
],
"preemption_mode": [
"disabled"
],
"allowed_allocation_nodes": "",
"allowed_accounts": "",
"allowed_groups": "",
"allowed_qos": "",
"alternative": "",
"billing_weights": "",
"default_memory_per_cpu": null,
"default_memory_per_node": null,
"default_time_limit": null,
"denied_accounts": "",
"denied_qos": "",
"preemption_grace_time": 0,
"maximum_cpus_per_node": -1,
"maximum_memory_per_cpu": null,
"maximum_memory_per_node": null,
"maximum_nodes_per_job": -1,
"max_time_limit": -1,
"min nodes per job": 0,
"name": "atlas",
"nodes": "naboo145,naboo146,naboo147,naboo216,naboo219,naboo222,naboo224,naboo225,naboo227,naboo228,naboo229,naboo234,naboo235,naboo236,naboo237,naboo238,naboo239,naboo240,naboo241,naboo242,naboo243",
"over_time_limit": null,
"priority_job_factor": 1,
"priority_tier": 1,
"qos": "",
"state": "UP",
"total_cpus": 632,
"total_nodes": 21,
"tres": "cpu=632,mem=1415207M,node=21,billing=632"
}
]
}

View file

@ -0,0 +1,20 @@
{
"meta": {
"plugin": {
"type": "openapi\/v0.0.38",
"name": "Slurm OpenAPI v0.0.38"
},
"Slurm": {
"version": {
"major": 22,
"micro": 9,
"minor": 5
},
"release": "22.05.9"
}
},
"errors": [
],
"reservations": [
]
}

View file

@ -0,0 +1,8 @@
[[inputs.slurm]]
  url = "willBeOverridden"
response_timeout = "5s"
# enabled_endpoints = []
## Credentials for JWT-based authentication
username = "root"
token = "topSecret"

View file

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"statistics": {}
}

View file

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"jobs": []
}

View file

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"nodes": []
}

View file

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"partitions": []
}

View file

@ -0,0 +1,5 @@
{
"meta": {},
"errors": [],
"reservations": []
}

View file

@ -0,0 +1,8 @@
[[inputs.slurm]]
  url = "willBeOverridden"
response_timeout = "5s"
enabled_endpoints = []
## Credentials for JWT-based authentication
username = "root"
token = "topSecret"