
Adding upstream version 1.34.4.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-24 07:26:29 +02:00
parent e393c3af3f
commit 4978089aab
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
4963 changed files with 677545 additions and 0 deletions


@@ -0,0 +1,355 @@
# Common vSphere Performance Metrics
The set of performance metrics in vSphere is open ended. Metrics may be added
or removed in new releases, and the set of available metrics may vary depending
on the hardware, as well as on which plugins and add-on products are installed.
Therefore, providing a definitive list of available metrics is difficult. The
metrics listed below are the most commonly available as of vSphere 6.5.
For a complete list of metrics available from vSphere and the units they are
measured in, please refer to the [VMware Product Documentation][product_doc] or
the [VMware Performance Manager Documentation][perf_manager_doc].
To list the exact set in your environment, please use the govc tool, available
[here](https://github.com/vmware/govmomi/tree/master/govc).
To obtain the set of metrics for, e.g., a VM, you may use the following command:
```shell
govc metric.ls vm/*
```
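The same command works for other inventory objects as well; for example, to
list the metrics available on a specific host (the datacenter, cluster and
host names below are placeholders for your own inventory):
```shell
govc metric.ls /DC0/host/Cluster0/esxi01.example.com
```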
[product_doc]: https://docs.vmware.com/en/VMware-vSphere/7.0/com.vmware.vsphere.monitoring.doc/GUID-FF7F87C7-91E7-4A2D-88B5-E3E04A76F51B.html
[perf_manager_doc]: https://vdc-repo.vmware.com/vmwb-repository/dcr-public/eda658cb-b729-480e-99bc-d3c961055a38/dc769ba5-3cfa-44b1-a5f9-ad807521af19/doc/vim.PerformanceManager.html
## Virtual Machine Metrics
```metrics
cpu.demandEntitlementRatio.latest
cpu.usage.average
cpu.ready.summation
cpu.run.summation
cpu.system.summation
cpu.swapwait.summation
cpu.costop.summation
cpu.demand.average
cpu.readiness.average
cpu.maxlimited.summation
cpu.wait.summation
cpu.usagemhz.average
cpu.latency.average
cpu.used.summation
cpu.overlap.summation
cpu.idle.summation
cpu.entitlement.latest
datastore.maxTotalLatency.latest
disk.usage.average
disk.read.average
disk.write.average
disk.maxTotalLatency.latest
mem.llSwapUsed.average
mem.swapin.average
mem.vmmemctltarget.average
mem.activewrite.average
mem.overhead.average
mem.vmmemctl.average
mem.zero.average
mem.swapoutRate.average
mem.active.average
mem.llSwapOutRate.average
mem.swapout.average
mem.llSwapInRate.average
mem.swapinRate.average
mem.granted.average
mem.latency.average
mem.overheadMax.average
mem.swapped.average
mem.compressionRate.average
mem.swaptarget.average
mem.shared.average
mem.zipSaved.latest
mem.overheadTouched.average
mem.zipped.latest
mem.consumed.average
mem.entitlement.average
mem.usage.average
mem.decompressionRate.average
mem.compressed.average
net.multicastRx.summation
net.transmitted.average
net.received.average
net.usage.average
net.broadcastTx.summation
net.broadcastRx.summation
net.packetsRx.summation
net.pnicBytesRx.average
net.multicastTx.summation
net.bytesTx.average
net.bytesRx.average
net.droppedRx.summation
net.pnicBytesTx.average
net.droppedTx.summation
net.packetsTx.summation
power.power.average
power.energy.summation
rescpu.runpk1.latest
rescpu.runpk15.latest
rescpu.maxLimited5.latest
rescpu.actpk5.latest
rescpu.samplePeriod.latest
rescpu.runav1.latest
rescpu.runav15.latest
rescpu.sampleCount.latest
rescpu.actpk1.latest
rescpu.runpk5.latest
rescpu.runav5.latest
rescpu.actav15.latest
rescpu.actav1.latest
rescpu.actpk15.latest
rescpu.actav5.latest
rescpu.maxLimited1.latest
rescpu.maxLimited15.latest
sys.osUptime.latest
sys.uptime.latest
sys.heartbeat.latest
virtualDisk.write.average
virtualDisk.read.average
```
## Host System Metrics
```metrics
cpu.corecount.contention.average
cpu.usage.average
cpu.reservedCapacity.average
cpu.usagemhz.minimum
cpu.usagemhz.maximum
cpu.usage.minimum
cpu.usage.maximum
cpu.capacity.provisioned.average
cpu.capacity.usage.average
cpu.capacity.demand.average
cpu.capacity.contention.average
cpu.corecount.provisioned.average
cpu.corecount.usage.average
cpu.usagemhz.average
disk.throughput.contention.average
disk.throughput.usage.average
mem.decompressionRate.average
mem.granted.average
mem.active.average
mem.shared.average
mem.zero.average
mem.swapused.average
mem.vmmemctl.average
mem.compressed.average
mem.compressionRate.average
mem.reservedCapacity.average
mem.capacity.provisioned.average
mem.capacity.usable.average
mem.capacity.usage.average
mem.capacity.entitlement.average
mem.capacity.contention.average
mem.usage.minimum
mem.overhead.minimum
mem.consumed.minimum
mem.granted.minimum
mem.active.minimum
mem.shared.minimum
mem.zero.minimum
mem.swapused.minimum
mem.consumed.average
mem.usage.maximum
mem.overhead.maximum
mem.consumed.maximum
mem.granted.maximum
mem.overhead.average
mem.shared.maximum
mem.zero.maximum
mem.swapused.maximum
mem.vmmemctl.maximum
mem.usage.average
mem.active.maximum
mem.vmmemctl.minimum
net.throughput.contention.summation
net.throughput.usage.average
net.throughput.usable.average
net.throughput.provisioned.average
power.power.average
power.powerCap.average
power.energy.summation
vmop.numShutdownGuest.latest
vmop.numPoweroff.latest
vmop.numSuspend.latest
vmop.numReset.latest
vmop.numRebootGuest.latest
vmop.numStandbyGuest.latest
vmop.numPoweron.latest
vmop.numCreate.latest
vmop.numDestroy.latest
vmop.numRegister.latest
vmop.numUnregister.latest
vmop.numReconfigure.latest
vmop.numClone.latest
vmop.numDeploy.latest
vmop.numChangeHost.latest
vmop.numChangeDS.latest
vmop.numChangeHostDS.latest
vmop.numVMotion.latest
vmop.numSVMotion.latest
vmop.numXVMotion.latest
```
## Resource Pool Metrics
```metrics
cpu.usagemhz.average
cpu.cpuentitlement.latest
cpu.usagemhz.minimum
cpu.usagemhz.maximum
cpu.capacity.entitlement.average
cpu.capacity.usage.average
cpu.capacity.demand.average
cpu.capacity.contention.average
cpu.corecount.provisioned.average
cpu.corecount.contention.average
disk.throughput.usage.average
disk.throughput.contention.average
mem.capacity.contention.average
mem.overhead.average
mem.consumed.average
mem.granted.average
mem.active.average
mem.shared.average
mem.zero.average
mem.swapped.average
mem.vmmemctl.average
mem.capacity.provisioned.average
mem.capacity.entitlement.average
mem.capacity.usage.average
mem.mementitlement.latest
mem.compressed.average
mem.compressionRate.average
mem.decompressionRate.average
mem.overhead.minimum
mem.consumed.minimum
mem.granted.minimum
mem.active.minimum
mem.shared.minimum
mem.zero.minimum
mem.swapped.minimum
mem.vmmemctl.maximum
mem.overhead.maximum
mem.consumed.maximum
mem.granted.maximum
mem.active.maximum
mem.shared.maximum
mem.zero.maximum
mem.swapped.maximum
mem.vmmemctl.minimum
net.throughput.usage.average
net.throughput.contention.summation
power.power.average
power.energy.summation
```
## Cluster Metrics
```metrics
cpu.corecount.contention.average
cpu.usage.average
cpu.reservedCapacity.average
cpu.usagemhz.minimum
cpu.usagemhz.maximum
cpu.usage.minimum
cpu.usage.maximum
cpu.capacity.provisioned.average
cpu.capacity.usage.average
cpu.capacity.demand.average
cpu.capacity.contention.average
cpu.corecount.provisioned.average
cpu.corecount.usage.average
cpu.usagemhz.average
disk.throughput.contention.average
disk.throughput.usage.average
mem.decompressionRate.average
mem.granted.average
mem.active.average
mem.shared.average
mem.zero.average
mem.swapused.average
mem.vmmemctl.average
mem.compressed.average
mem.compressionRate.average
mem.reservedCapacity.average
mem.capacity.provisioned.average
mem.capacity.usable.average
mem.capacity.usage.average
mem.capacity.entitlement.average
mem.capacity.contention.average
mem.usage.minimum
mem.overhead.minimum
mem.consumed.minimum
mem.granted.minimum
mem.active.minimum
mem.shared.minimum
mem.zero.minimum
mem.swapused.minimum
mem.consumed.average
mem.usage.maximum
mem.overhead.maximum
mem.consumed.maximum
mem.granted.maximum
mem.overhead.average
mem.shared.maximum
mem.zero.maximum
mem.swapused.maximum
mem.vmmemctl.maximum
mem.usage.average
mem.active.maximum
mem.vmmemctl.minimum
net.throughput.contention.summation
net.throughput.usage.average
net.throughput.usable.average
net.throughput.provisioned.average
power.power.average
power.powerCap.average
power.energy.summation
vmop.numShutdownGuest.latest
vmop.numPoweroff.latest
vmop.numSuspend.latest
vmop.numReset.latest
vmop.numRebootGuest.latest
vmop.numStandbyGuest.latest
vmop.numPoweron.latest
vmop.numCreate.latest
vmop.numDestroy.latest
vmop.numRegister.latest
vmop.numUnregister.latest
vmop.numReconfigure.latest
vmop.numClone.latest
vmop.numDeploy.latest
vmop.numChangeHost.latest
vmop.numChangeDS.latest
vmop.numChangeHostDS.latest
vmop.numVMotion.latest
vmop.numSVMotion.latest
vmop.numXVMotion.latest
```
## Datastore Metrics
```metrics
datastore.numberReadAveraged.average
datastore.throughput.contention.average
datastore.throughput.usage.average
datastore.write.average
datastore.read.average
datastore.numberWriteAveraged.average
disk.used.latest
disk.provisioned.latest
disk.capacity.latest
disk.capacity.contention.average
disk.capacity.provisioned.average
disk.capacity.usage.average
```

File diff suppressed because it is too large


@@ -0,0 +1,361 @@
package vsphere
import (
"context"
"crypto/tls"
"fmt"
"net/url"
"strconv"
"strings"
"sync"
"time"
"github.com/vmware/govmomi"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/performance"
"github.com/vmware/govmomi/session"
"github.com/vmware/govmomi/view"
"github.com/vmware/govmomi/vim25"
"github.com/vmware/govmomi/vim25/methods"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
"github.com/influxdata/telegraf"
)
// The highest number of metrics we will ever query for, regardless of what the
// settings and the server say.
const absoluteMaxMetrics = 10000
// clientFactory is used to obtain Clients to be used throughout the plugin. Typically,
// a single client is reused across all functions and goroutines, but the client
// is periodically recycled to avoid authentication expiration issues.
type clientFactory struct {
client *client
mux sync.Mutex
vSphereURL *url.URL
parent *VSphere
}
// client represents a connection to vSphere and is backed by a govmomi connection
type client struct {
client *govmomi.Client
views *view.Manager
root *view.ContainerView
perf *performance.Manager
valid bool
timeout time.Duration
closeGate sync.Once
log telegraf.Logger
}
// newClientFactory creates a new clientFactory and prepares it for use.
func newClientFactory(vSphereURL *url.URL, parent *VSphere) *clientFactory {
return &clientFactory{
client: nil,
parent: parent,
vSphereURL: vSphereURL,
}
}
// getClient returns a client. The caller is responsible for calling Release()
// on the client once it's done using it.
func (cf *clientFactory) getClient(ctx context.Context) (*client, error) {
cf.mux.Lock()
defer cf.mux.Unlock()
retrying := false
for {
if cf.client == nil {
var err error
if cf.client, err = newClient(ctx, cf.vSphereURL, cf.parent); err != nil {
return nil, err
}
}
err := cf.testClient(ctx)
if err != nil {
if !retrying {
// The client went stale. Probably because someone rebooted vCenter. Clear it to
// force us to create a fresh one. We only get one chance at this. If we fail a second time
// we will simply skip this collection round and hope things have stabilized for the next one.
retrying = true
cf.client = nil
continue
}
return nil, err
}
return cf.client, nil
}
}
func (cf *clientFactory) testClient(ctx context.Context) error {
// Execute a dummy call against the server to make sure the client is
// still functional. If not, try to log back in. If that doesn't work,
// we give up.
ctx1, cancel1 := context.WithTimeout(ctx, time.Duration(cf.parent.Timeout))
defer cancel1()
if _, err := methods.GetCurrentTime(ctx1, cf.client.client); err != nil {
cf.parent.Log.Info("Client session seems to have timed out. Reauthenticating!")
ctx2, cancel2 := context.WithTimeout(ctx, time.Duration(cf.parent.Timeout))
defer cancel2()
// Resolve the secrets and construct the authentication info
username, err := cf.parent.Username.Get()
if err != nil {
return fmt.Errorf("getting username failed: %w", err)
}
defer username.Destroy()
password, err := cf.parent.Password.Get()
if err != nil {
return fmt.Errorf("getting password failed: %w", err)
}
defer password.Destroy()
auth := url.UserPassword(username.String(), password.String())
if err := cf.client.client.SessionManager.Login(ctx2, auth); err != nil {
return fmt.Errorf("renewing authentication failed: %w", err)
}
}
return nil
}
// newClient creates a new vSphere client based on the URL and settings passed as parameters.
func newClient(ctx context.Context, vSphereURL *url.URL, vs *VSphere) (*client, error) {
sw := newStopwatch("connect", vSphereURL.Host)
defer sw.stop()
tlsCfg, err := vs.ClientConfig.TLSConfig()
if err != nil {
return nil, err
}
// Use a default TLS config if it's missing
if tlsCfg == nil {
tlsCfg = &tls.Config{}
}
if !vs.Username.Empty() {
// Resolve the secrets and construct the authentication info
username, err := vs.Username.Get()
if err != nil {
return nil, fmt.Errorf("getting username failed: %w", err)
}
password, err := vs.Password.Get()
if err != nil {
username.Destroy()
return nil, fmt.Errorf("getting password failed: %w", err)
}
vSphereURL.User = url.UserPassword(username.String(), password.String())
username.Destroy()
password.Destroy()
}
vs.Log.Debugf("Creating client: %s", vSphereURL.Host)
soapClient := soap.NewClient(vSphereURL, tlsCfg.InsecureSkipVerify)
// Add certificate if we have it. Use it to log us in.
if len(tlsCfg.Certificates) > 0 {
soapClient.SetCertificate(tlsCfg.Certificates[0])
}
// Set up custom CA chain if specified. We need to do this before we create the vim25 client,
// since it might fail on missing CA chains otherwise.
if vs.TLSCA != "" {
if err := soapClient.SetRootCAs(vs.TLSCA); err != nil {
return nil, err
}
}
// Set the proxy dependent on the settings
proxy, err := vs.HTTPProxy.Proxy()
if err != nil {
return nil, fmt.Errorf("creating proxy failed: %w", err)
}
transport := soapClient.DefaultTransport()
transport.Proxy = proxy
soapClient.Client.Transport = transport
ctx1, cancel1 := context.WithTimeout(ctx, time.Duration(vs.Timeout))
defer cancel1()
vimClient, err := vim25.NewClient(ctx1, soapClient)
if err != nil {
return nil, err
}
sm := session.NewManager(vimClient)
// If TLSKey is specified, try to log in as an extension using a cert.
if vs.TLSKey != "" {
ctx2, cancel2 := context.WithTimeout(ctx, time.Duration(vs.Timeout))
defer cancel2()
if err := sm.LoginExtensionByCertificate(ctx2, vs.TLSKey); err != nil {
return nil, err
}
}
// Create the govmomi client.
c := &govmomi.Client{
Client: vimClient,
SessionManager: sm,
}
// Only login if the URL contains user information.
if vSphereURL.User != nil {
if err := c.Login(ctx, vSphereURL.User); err != nil {
return nil, err
}
}
c.Timeout = time.Duration(vs.Timeout)
m := view.NewManager(c.Client)
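// Create a container view rooted at the root folder. The empty type list matches all object
// types and the 'true' flag makes the view recursive, so it covers the entire inventory.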
v, err := m.CreateContainerView(ctx, c.ServiceContent.RootFolder, make([]string, 0), true)
if err != nil {
return nil, err
}
p := performance.NewManager(c.Client)
client := &client{
log: vs.Log,
client: c,
views: m,
root: v,
perf: p,
valid: true,
timeout: time.Duration(vs.Timeout),
}
// Adjust max query size if needed
ctx3, cancel3 := context.WithTimeout(ctx, time.Duration(vs.Timeout))
defer cancel3()
n, err := client.getMaxQueryMetrics(ctx3)
if err != nil {
return nil, err
}
vs.Log.Debugf("vCenter says max_query_metrics should be %d", n)
if n < vs.MaxQueryMetrics {
vs.Log.Warnf("Configured max_query_metrics is %d, but server limits it to %d. Reducing.", vs.MaxQueryMetrics, n)
vs.MaxQueryMetrics = n
}
return client, nil
}
// close shuts down a clientFactory and releases any resources associated with it.
func (cf *clientFactory) close() {
cf.mux.Lock()
defer cf.mux.Unlock()
if cf.client != nil {
cf.client.close()
}
}
func (c *client) close() {
// Use a sync.Once to prevent panics stemming from trying
// to close the connection multiple times.
c.closeGate.Do(func() {
ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
defer cancel()
if c.client != nil {
if err := c.client.Logout(ctx); err != nil {
c.log.Errorf("Logout: %s", err.Error())
}
}
})
}
// getServerTime returns the time at the vCenter server
func (c *client) getServerTime(ctx context.Context) (time.Time, error) {
ctx, cancel := context.WithTimeout(ctx, c.timeout)
defer cancel()
t, err := methods.GetCurrentTime(ctx, c.client)
if err != nil {
return time.Time{}, err
}
return *t, nil
}
// getMaxQueryMetrics returns the max_query_metrics setting as configured in vCenter
func (c *client) getMaxQueryMetrics(ctx context.Context) (int, error) {
ctx, cancel := context.WithTimeout(ctx, c.timeout)
defer cancel()
om := object.NewOptionManager(c.client.Client, *c.client.Client.ServiceContent.Setting)
res, err := om.Query(ctx, "config.vpxd.stats.maxQueryMetrics")
if err == nil {
if len(res) > 0 {
if s, ok := res[0].GetOptionValue().Value.(string); ok {
v, err := strconv.Atoi(s)
if err == nil {
c.log.Debugf("vCenter maxQueryMetrics is defined: %d", v)
if v == -1 {
// Whatever the server says, we never ask for more metrics than this.
return absoluteMaxMetrics, nil
}
return v, nil
}
}
// Fall through to version-based inference if the value isn't usable
}
} else {
c.log.Debug("Option query for maxQueryMetrics failed. Using default")
}
// No usable maxQueryMetrics setting. Infer based on version
ver := c.client.Client.ServiceContent.About.Version
parts := strings.Split(ver, ".")
if len(parts) < 2 {
c.log.Warnf("vCenter returned an invalid version string: %s. Using default query size=64", ver)
return 64, nil
}
c.log.Debugf("vCenter version is: %s", ver)
major, err := strconv.Atoi(parts[0])
if err != nil {
return 0, err
}
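// vCenter 6.0 and earlier limit queries to 64 metrics; later versions default to 256.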
if major < 6 || major == 6 && parts[1] == "0" {
return 64, nil
}
return 256, nil
}
// queryMetrics wraps performance.Query to give it proper timeouts
func (c *client) queryMetrics(ctx context.Context, pqs []types.PerfQuerySpec) ([]performance.EntityMetric, error) {
ctx1, cancel1 := context.WithTimeout(ctx, c.timeout)
defer cancel1()
metrics, err := c.perf.Query(ctx1, pqs)
if err != nil {
return nil, err
}
ctx2, cancel2 := context.WithTimeout(ctx, c.timeout)
defer cancel2()
return c.perf.ToMetricSeries(ctx2, metrics)
}
// counterInfoByName wraps performance.CounterInfoByName to give it proper timeouts
func (c *client) counterInfoByName(ctx context.Context) (map[string]*types.PerfCounterInfo, error) {
ctx1, cancel1 := context.WithTimeout(ctx, c.timeout)
defer cancel1()
return c.perf.CounterInfoByName(ctx1)
}
// counterInfoByKey wraps performance.CounterInfoByKey to give it proper timeouts
func (c *client) counterInfoByKey(ctx context.Context) (map[int32]*types.PerfCounterInfo, error) {
ctx1, cancel1 := context.WithTimeout(ctx, c.timeout)
defer cancel1()
return c.perf.CounterInfoByKey(ctx1)
}
func (c *client) getCustomFields(ctx context.Context) (map[int32]string, error) {
ctx1, cancel1 := context.WithTimeout(ctx, c.timeout)
defer cancel1()
cfm := object.NewCustomFieldsManager(c.client.Client)
fields, err := cfm.Field(ctx1)
if err != nil {
return nil, err
}
r := make(map[int32]string)
for _, f := range fields {
r[f.Key] = f.Name
}
return r, nil
}

File diff suppressed because it is too large


@@ -0,0 +1,276 @@
package vsphere
import (
"context"
"reflect"
"strings"
"github.com/vmware/govmomi/property"
"github.com/vmware/govmomi/view"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/types"
)
var childTypes map[string][]string
var addFields map[string][]string
var containers map[string]interface{}
// finder allows callers to find resources in vCenter given a query string.
type finder struct {
client *client
}
// resourceFilter is a convenience type holding a finder and a set of paths. It is useful when you need a
// self-contained object capable of returning a certain set of resources.
type resourceFilter struct {
finder *finder
resType string
paths []string
excludePaths []string
}
// findAll returns the union of resources found given the supplied resource type and paths.
func (f *finder) findAll(ctx context.Context, resType string, paths, excludePaths []string, dst interface{}) error {
objs := make(map[string]types.ObjectContent)
for _, p := range paths {
if err := f.findResources(ctx, resType, p, objs); err != nil {
return err
}
}
if len(excludePaths) > 0 {
excludes := make(map[string]types.ObjectContent)
for _, p := range excludePaths {
if err := f.findResources(ctx, resType, p, excludes); err != nil {
return err
}
}
for k := range excludes {
delete(objs, k)
}
}
return objectContentToTypedArray(objs, dst)
}
// find returns the resources matching the specified path.
func (f *finder) find(ctx context.Context, resType, path string, dst interface{}) error {
objs := make(map[string]types.ObjectContent)
err := f.findResources(ctx, resType, path, objs)
if err != nil {
return err
}
return objectContentToTypedArray(objs, dst)
}
func (f *finder) findResources(ctx context.Context, resType, path string, objs map[string]types.ObjectContent) error {
p := strings.Split(path, "/")
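// Inventory paths start with "/", so p[0] is empty and skipped; each remaining
// component becomes one name-match token.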
flt := make([]property.Match, len(p)-1)
for i := 1; i < len(p); i++ {
flt[i-1] = property.Match{"name": p[i]}
}
err := f.descend(ctx, f.client.client.ServiceContent.RootFolder, resType, flt, 0, objs)
if err != nil {
return err
}
f.client.log.Debugf("Find(%s, %s) returned %d objects", resType, path, len(objs))
return nil
}
func (f *finder) descend(ctx context.Context, root types.ManagedObjectReference, resType string,
tokens []property.Match, pos int, objs map[string]types.ObjectContent) error {
isLeaf := pos == len(tokens)-1
// No more tokens to match?
if pos >= len(tokens) {
return nil
}
// Determine child types
ct, ok := childTypes[root.Reference().Type]
if !ok {
// We don't know how to handle children of this type. Stop descending.
return nil
}
m := view.NewManager(f.client.client.Client)
v, err := m.CreateContainerView(ctx, root, ct, false)
if err != nil {
return err
}
defer v.Destroy(ctx) //nolint:errcheck // Ignore the returned error as we cannot do anything about it anyway
var content []types.ObjectContent
fields := []string{"name"}
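// A "**" token is a recursive wildcard: it matches this level and everything below it.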
recurse := tokens[pos]["name"] == "**"
objectTypes := ct
if isLeaf {
if af, ok := addFields[resType]; ok {
fields = append(fields, af...)
}
if recurse {
// Special case: The last token is a recursive wildcard, so we can grab everything
// recursively in a single call.
v2, err := m.CreateContainerView(ctx, root, []string{resType}, true)
if err != nil {
return err
}
defer v2.Destroy(ctx) //nolint:errcheck // Ignore the returned error as we cannot do anything about it anyway
err = v2.Retrieve(ctx, []string{resType}, fields, &content)
if err != nil {
return err
}
for _, c := range content {
objs[c.Obj.String()] = c
}
return nil
}
objectTypes = []string{resType} // Only load wanted object type at leaf level
}
err = v.Retrieve(ctx, objectTypes, fields, &content)
if err != nil {
return err
}
rerunAsLeaf := false
for _, c := range content {
if !matchName(tokens[pos], c.PropSet) {
continue
}
// Already been here through another path? Skip!
if _, ok := objs[root.Reference().String()]; ok {
continue
}
if c.Obj.Type == resType && isLeaf {
// We found what we're looking for. Consider it a leaf and stop descending
objs[c.Obj.String()] = c
continue
}
// Deal with recursive wildcards (**)
var inc int
if recurse {
inc = 0 // By default, we stay on this token
// Lookahead to next token.
if matchName(tokens[pos+1], c.PropSet) {
// Are we looking ahead at a leaf node that has the wanted type?
// Rerun the entire level as a leaf. This is needed since all properties aren't loaded
// when we're processing non-leaf nodes.
if pos == len(tokens)-2 {
if c.Obj.Type == resType {
rerunAsLeaf = true
continue
}
} else if _, ok := containers[c.Obj.Type]; ok {
// Tokens match and we're looking ahead at a container type that's not a leaf
// Consume this token and the next.
inc = 2
}
}
} else {
// The normal case: Advance to next token before descending
inc = 1
}
err := f.descend(ctx, c.Obj, resType, tokens, pos+inc, objs)
if err != nil {
return err
}
}
if rerunAsLeaf {
// We're at a "pseudo leaf", i.e. we looked ahead a token and found that this level contains leaf nodes.
// Rerun the entire level as a leaf to get those nodes. This will only be executed when pos is one token
// before the last, so pos+1 will always point to a leaf token.
return f.descend(ctx, root, resType, tokens, pos+1, objs)
}
return nil
}
func objectContentToTypedArray(objs map[string]types.ObjectContent, dst interface{}) error {
rt := reflect.TypeOf(dst)
if rt == nil || rt.Kind() != reflect.Ptr {
panic("need pointer")
}
rv := reflect.ValueOf(dst).Elem()
if !rv.CanSet() {
panic("cannot set dst")
}
for _, p := range objs {
v, err := mo.ObjectContentToType(p)
if err != nil {
return err
}
vt := reflect.TypeOf(v)
if !rv.Type().AssignableTo(vt) {
// For example: dst is []ManagedEntity, res is []HostSystem
if field, ok := vt.FieldByName(rt.Elem().Elem().Name()); ok && field.Anonymous {
rv.Set(reflect.Append(rv, reflect.ValueOf(v).FieldByIndex(field.Index)))
continue
}
}
rv.Set(reflect.Append(rv, reflect.ValueOf(v)))
}
return nil
}
// findAll finds all resources matching the paths that were specified upon creation of the resourceFilter.
func (r *resourceFilter) findAll(ctx context.Context, dst interface{}) error {
return r.finder.findAll(ctx, r.resType, r.paths, r.excludePaths, dst)
}
func matchName(f property.Match, props []types.DynamicProperty) bool {
for _, prop := range props {
if prop.Name == "name" {
return f.Property(prop)
}
}
return false
}
func init() {
childTypes = map[string][]string{
"HostSystem": {"VirtualMachine"},
"ResourcePool": {"VirtualMachine"},
"ComputeResource": {"HostSystem", "ResourcePool", "VirtualApp"},
"ClusterComputeResource": {"HostSystem", "ResourcePool", "VirtualApp"},
"Datacenter": {"Folder"},
"Folder": {
"Folder",
"Datacenter",
"VirtualMachine",
"ComputeResource",
"ClusterComputeResource",
"Datastore",
},
}
addFields = map[string][]string{
"HostSystem": {"parent", "summary.customValue", "customValue"},
"ResourcePool": {"parent", "customValue"},
"VirtualMachine": {"runtime.host", "config.guestId", "config.uuid", "runtime.powerState",
"summary.customValue", "summary.config.memorySizeMB", "guest.guestId", "guest.net", "guest.hostName",
"resourcePool", "customValue"},
"Datastore": {"parent", "info", "customValue"},
"ClusterComputeResource": {"parent", "customValue"},
"Datacenter": {"parent", "customValue"},
"HostNumericSensorInfo": {"parent", "temperature", "baseUnits"},
}
containers = map[string]interface{}{
"HostSystem": nil,
"ComputeResource": nil,
"Datacenter": nil,
"ResourcePool": nil,
"Folder": nil,
"VirtualApp": nil,
}
}


@@ -0,0 +1,215 @@
# Read metrics from one or many vCenters
[[inputs.vsphere]]
## List of vCenter URLs to be monitored. These three lines must be uncommented
## and edited for the plugin to work.
vcenters = [ "https://vcenter.local/sdk" ]
username = "user@corp.local"
password = "secret"
## VMs
## Typical VM metrics (if omitted or empty, all metrics are collected)
# vm_include = [ "/*/vm/**"] # Inventory path to VMs to collect (by default all are collected)
# vm_exclude = [] # Inventory paths to exclude
vm_metric_include = [
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.run.summation",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.wait.summation",
"mem.active.average",
"mem.granted.average",
"mem.latency.average",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.usage.average",
"power.power.average",
"virtualDisk.numberReadAveraged.average",
"virtualDisk.numberWriteAveraged.average",
"virtualDisk.read.average",
"virtualDisk.readOIO.latest",
"virtualDisk.throughput.usage.average",
"virtualDisk.totalReadLatency.average",
"virtualDisk.totalWriteLatency.average",
"virtualDisk.write.average",
"virtualDisk.writeOIO.latest",
"sys.uptime.latest",
]
# vm_metric_exclude = [] ## Nothing is excluded by default
# vm_instances = true ## true by default
## Hosts
## Typical host metrics (if omitted or empty, all metrics are collected)
# host_include = [ "/*/host/**"] # Inventory path to hosts to collect (by default all are collected)
# host_exclude = [] # Inventory paths to exclude
host_metric_include = [
"cpu.coreUtilization.average",
"cpu.costop.summation",
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.swapwait.summation",
"cpu.usage.average",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.utilization.average",
"cpu.wait.summation",
"disk.deviceReadLatency.average",
"disk.deviceWriteLatency.average",
"disk.kernelReadLatency.average",
"disk.kernelWriteLatency.average",
"disk.numberReadAveraged.average",
"disk.numberWriteAveraged.average",
"disk.read.average",
"disk.totalReadLatency.average",
"disk.totalWriteLatency.average",
"disk.write.average",
"mem.active.average",
"mem.latency.average",
"mem.state.latest",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.totalCapacity.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.errorsRx.summation",
"net.errorsTx.summation",
"net.usage.average",
"power.power.average",
"storageAdapter.numberReadAveraged.average",
"storageAdapter.numberWriteAveraged.average",
"storageAdapter.read.average",
"storageAdapter.write.average",
"sys.uptime.latest",
]
## Collect IP addresses? Valid values are "ipv4" and "ipv6"
# ip_addresses = ["ipv6", "ipv4" ]
# host_metric_exclude = [] ## Nothing excluded by default
# host_instances = true ## true by default
## Clusters
# cluster_include = [ "/*/host/**"] # Inventory path to clusters to collect (by default all are collected)
# cluster_exclude = [] # Inventory paths to exclude
# cluster_metric_include = [] ## if omitted or empty, all metrics are collected
# cluster_metric_exclude = [] ## Nothing excluded by default
# cluster_instances = false ## false by default
## Resource Pools
# resource_pool_include = [ "/*/host/**"] # Inventory path to resource pools to collect (by default all are collected)
# resource_pool_exclude = [] # Inventory paths to exclude
# resource_pool_metric_include = [] ## if omitted or empty, all metrics are collected
# resource_pool_metric_exclude = [] ## Nothing excluded by default
# resource_pool_instances = false ## false by default
## Datastores
# datastore_include = [ "/*/datastore/**"] # Inventory path to datastores to collect (by default all are collected)
# datastore_exclude = [] # Inventory paths to exclude
# datastore_metric_include = [] ## if omitted or empty, all metrics are collected
# datastore_metric_exclude = [] ## Nothing excluded by default
# datastore_instances = false ## false by default
## Datacenters
# datacenter_include = [ "/*/host/**"] # Inventory path to datacenters to collect (by default all are collected)
# datacenter_exclude = [] # Inventory paths to exclude
datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
## VSAN
# vsan_metric_include = [] ## if omitted or empty, all metrics are collected
# vsan_metric_exclude = [ "*" ] ## vSAN are not collected by default.
## Whether to skip verifying vSAN metrics against the ones from GetSupportedEntityTypes API.
# vsan_metric_skip_verify = false ## false by default.
## Interval for sampling vSAN performance metrics, can be reduced down to
## 30 seconds for vSAN 8 U1.
# vsan_interval = "5m"
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
## number of objects to retrieve per query for realtime resources (vms and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
## number of go routines to use for collection and discovery of objects and metrics
# collect_concurrency = 1
# discover_concurrency = 1
## the interval before (re)discovering objects subject to metrics collection (default: 300s)
# object_discovery_interval = "300s"
## timeout applies to any of the API requests made to vCenter
# timeout = "60s"
## When set to true, all samples are sent as integers. This makes the output
## data types backwards compatible with Telegraf 1.9 or lower. Normally all
## samples from vCenter, with the exception of percentages, are integer
## values, but under some conditions, some averaging takes place internally in
## the plugin. Setting this flag to "false" will send values as floats to
## preserve the full precision when averaging takes place.
# use_int_samples = true
## Custom attributes from vCenter can be very useful for queries in order to slice the
## metrics along different dimensions and for forming ad-hoc relationships. They are disabled
## by default, since they can add a considerable amount of tags to the resulting metrics. To
## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
## to select the attributes you want to include.
# custom_attribute_include = []
# custom_attribute_exclude = ["*"]
## The number of vSphere 5 minute metric collection cycles to look back for non-realtime metrics. In
## some versions (6.7, 7.0 and possibly more), certain metrics, such as cluster metrics, may be reported
## with a significant delay (>30min). If this happens, try increasing this number. Please note that increasing
## it too much may cause performance issues.
# metric_lookback = 3
## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
# insecure_skip_verify = false
## The Historical Interval value must match EXACTLY the interval in the daily
## "Interval Duration" found on the vCenter server under Configure > General > Statistics > Statistic intervals
# historical_interval = "5m"
## Specifies plugin behavior regarding disconnected servers
## Available choices :
## - error: telegraf will return an error on startup if one of the servers is unreachable
## - ignore: telegraf will ignore unreachable servers on both startup and gather
# disconnected_servers_behavior = "error"
## HTTP Proxy support
# use_system_proxy = true
# http_proxy_url = ""


@@ -0,0 +1,42 @@
package vsphere
import (
"time"
"github.com/influxdata/telegraf/selfstat"
)
// stopwatch is a simple helper for recording timing information, such as gather times and discovery times.
type stopwatch struct {
stat selfstat.Stat
start time.Time
}
// newStopwatch creates a new stopwatch and starts measuring time from its creation.
func newStopwatch(name, vCenter string) *stopwatch {
return &stopwatch{
stat: selfstat.RegisterTiming("vsphere", name+"_ns", map[string]string{"vcenter": vCenter}),
start: time.Now(),
}
}
// newStopwatchWithTags creates a new stopwatch and starts measuring time from its creation. Allows additional tags.
func newStopwatchWithTags(name, vCenter string, tags map[string]string) *stopwatch {
tags["vcenter"] = vCenter
return &stopwatch{
stat: selfstat.RegisterTiming("vsphere", name+"_ns", tags),
start: time.Now(),
}
}
// stop stops a stopwatch and records the time.
func (s *stopwatch) stop() {
s.stat.Set(time.Since(s.start).Nanoseconds())
}
// sendInternalCounterWithTags is a convenience method for sending non-timing internal metrics. Allows additional tags
func sendInternalCounterWithTags(name, vCenter string, tags map[string]string, value int64) {
tags["vcenter"] = vCenter
s := selfstat.Register("vsphere", name, tags)
s.Set(value)
}


@@ -0,0 +1,44 @@
package vsphere
import (
"context"
"sync"
)
// throttledExecutor provides a simple mechanism for running jobs in separate
// goroutines while limiting the number of concurrent jobs running at any given time.
type throttledExecutor struct {
limiter chan struct{}
wg sync.WaitGroup
}
// newThrottledExecutor creates a new throttledExecutor with a specified maximum
// number of concurrent jobs
func newThrottledExecutor(limit int) *throttledExecutor {
if limit == 0 {
panic("Limit must be > 0")
}
return &throttledExecutor{limiter: make(chan struct{}, limit)}
}
// run schedules a job for execution as soon as possible while respecting the maximum concurrency limit.
func (t *throttledExecutor) run(ctx context.Context, job func()) {
t.wg.Add(1)
go func() {
defer t.wg.Done()
select {
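// Either grab a slot from the limiter, or give up if the context is cancelled first.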
case t.limiter <- struct{}{}:
defer func() {
<-t.limiter
}()
job()
case <-ctx.Done():
return
}
}()
}
// wait blocks until all scheduled jobs have finished
func (t *throttledExecutor) wait() {
t.wg.Wait()
}


@@ -0,0 +1,61 @@
package vsphere
import (
"sync"
"time"
"github.com/influxdata/telegraf"
)
// tsCache is a cache of timestamps used to determine the validity of datapoints
type tsCache struct {
ttl time.Duration
table map[string]time.Time
mux sync.RWMutex
log telegraf.Logger
}
// newTSCache creates a new tsCache with a specified time-to-live after which timestamps are discarded.
func newTSCache(ttl time.Duration, log telegraf.Logger) *tsCache {
return &tsCache{
ttl: ttl,
table: make(map[string]time.Time),
log: log,
}
}
// purge removes timestamps that are older than the time-to-live
func (t *tsCache) purge() {
t.mux.Lock()
defer t.mux.Unlock()
n := 0
for k, v := range t.table {
if time.Since(v) > t.ttl {
delete(t.table, k)
n++
}
}
t.log.Debugf("purged timestamp cache. %d deleted with %d remaining", n, len(t.table))
}
// get returns a timestamp (if present)
func (t *tsCache) get(key, metricName string) (time.Time, bool) {
t.mux.RLock()
defer t.mux.RUnlock()
ts, ok := t.table[makeKey(key, metricName)]
return ts, ok
}
// put updates the latest timestamp for the supplied key.
func (t *tsCache) put(key, metricName string, timestamp time.Time) {
t.mux.Lock()
defer t.mux.Unlock()
k := makeKey(key, metricName)
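// Only move the timestamp forward; an older sample never overwrites a newer one.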
if timestamp.After(t.table[k]) {
t.table[k] = timestamp
}
}
func makeKey(resource, metric string) string {
return resource + "|" + metric
}


@@ -0,0 +1,543 @@
package vsphere
import (
"context"
"encoding/json"
"errors"
"fmt"
"strconv"
"strings"
"time"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25"
"github.com/vmware/govmomi/vim25/methods"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
vsanmethods "github.com/vmware/govmomi/vsan/methods"
vsantypes "github.com/vmware/govmomi/vsan/types"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
)
const (
vsanNamespace = "vsan"
vsanPath = "/vsanHealth"
hwMarksKeyPrefix = "vsan-perf-"
perfPrefix = "performance."
)
var (
vsanPerfMetricsName string
vsanSummaryMetricsName string
perfManagerRef = types.ManagedObjectReference{
Type: "VsanPerformanceManager",
Value: "vsan-performance-manager",
}
hyphenReplacer = strings.NewReplacer("-", "")
)
// collectVsan is the entry point for vsan metrics collection
func (e *endpoint) collectVsan(ctx context.Context, acc telegraf.Accumulator) error {
lower := versionLowerThan(e.apiVersion, 5, 5)
if lower {
return fmt.Errorf("a minimum API version of 5.5 is required for vSAN. Found: %s. Skipping vCenter: %s", e.apiVersion, e.url.Host)
}
vsanPerfMetricsName = strings.Join([]string{"vsphere", "vsan", "performance"}, e.parent.Separator)
vsanSummaryMetricsName = strings.Join([]string{"vsphere", "vsan", "summary"}, e.parent.Separator)
res := e.resourceKinds["vsan"]
client, err := e.clientFactory.getClient(ctx)
if err != nil {
return fmt.Errorf("fail to get client when collect vsan: %w", err)
}
// Create vSAN client
vimClient := client.client.Client
vsanClient := vimClient.NewServiceClient(vsanPath, vsanNamespace)
// vSAN Metrics to collect
metrics := e.getVsanMetadata(ctx, vsanClient, res)
// Iterate over all clusters, run a goroutine for each cluster
te := newThrottledExecutor(e.parent.CollectConcurrency)
for _, obj := range res.objects {
te.run(ctx, func() {
e.collectVsanPerCluster(ctx, obj, vimClient, vsanClient, metrics, acc)
})
}
te.wait()
return nil
}
// collectVsanPerCluster is called by goroutines in collectVsan function.
func (e *endpoint) collectVsanPerCluster(ctx context.Context, clusterRef *objectRef, vimClient *vim25.Client, vsanClient *soap.Client,
metrics map[string]string, acc telegraf.Accumulator) {
// Construct a map for cmmds
cluster := object.NewClusterComputeResource(vimClient, clusterRef.ref)
if !vsanEnabled(ctx, cluster) {
acc.AddError(fmt.Errorf("[vSAN] Fail to identify vSAN for cluster %s. Skipping", clusterRef.name))
return
}
// Do collection
if _, ok := metrics["summary.disk-usage"]; ok {
if err := e.queryDiskUsage(ctx, vsanClient, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying disk usage for cluster %s: %w", clusterRef.name, err))
}
}
if _, ok := metrics["summary.health"]; ok {
if err := e.queryHealthSummary(ctx, vsanClient, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying vsan health summary for cluster %s: %w", clusterRef.name, err))
}
}
if _, ok := metrics["summary.resync"]; ok {
if err := e.queryResyncSummary(ctx, vsanClient, cluster, clusterRef, acc); err != nil {
acc.AddError(fmt.Errorf("error querying vsan resync summary for cluster %s: %w", clusterRef.name, err))
}
}
cmmds, err := getCmmdsMap(ctx, vimClient, cluster)
if err != nil {
e.parent.Log.Errorf("[vSAN] Error while query cmmds data. Error: %s. Skipping", err)
cmmds = make(map[string]cmmdsEntity)
}
if err := e.queryPerformance(ctx, vsanClient, clusterRef, metrics, cmmds, acc); err != nil {
acc.AddError(fmt.Errorf("error querying performance metrics for cluster %s: %w", clusterRef.name, err))
}
}
// vsanEnabled returns True if vSAN is enabled, otherwise False
func vsanEnabled(ctx context.Context, clusterObj *object.ClusterComputeResource) bool {
config, err := clusterObj.Configuration(ctx)
if err != nil {
return false
}
enabled := config.VsanConfigInfo.Enabled
return enabled != nil && *enabled
}
// getVsanMetadata returns a string list of the entity types that will be queried.
// e.g. ["summary.health", "summary.disk-usage", "summary.resync", "performance.cluster-domclient", "performance.host-domclient"]
func (e *endpoint) getVsanMetadata(ctx context.Context, vsanClient *soap.Client, res *resourceKind) map[string]string {
metrics := make(map[string]string)
if res.simple { // Skip getting supported Entity types from vCenter. Using user defined metrics without verifying.
for _, entity := range res.include {
if strings.Contains(entity, "*") {
e.parent.Log.Infof("[vSAN] Won't use wildcard match \"*\" when vsan_metric_skip_verify = true. Skipping")
continue
}
metrics[entity] = ""
}
return metrics
}
// Use the include & exclude configuration to filter all summary metrics
for _, entity := range []string{"summary.health", "summary.disk-usage", "summary.resync"} {
if res.filters.Match(entity) {
metrics[entity] = ""
}
}
resp, err := vsanmethods.VsanPerfGetSupportedEntityTypes(ctx, vsanClient,
&vsantypes.VsanPerfGetSupportedEntityTypes{
This: perfManagerRef,
})
if err != nil {
e.parent.Log.Errorf("[vSAN] Fail to get supported entities: %v. Skipping vsan performance data.", err)
return metrics
}
// Use the include & exclude configuration to filter all supported performance metrics
for _, entity := range resp.Returnval {
if res.filters.Match(perfPrefix + entity.Name) {
metrics[perfPrefix+entity.Name] = ""
}
}
return metrics
}
// getCmmdsMap returns a map which maps a uuid to a cmmdsEntity
func getCmmdsMap(ctx context.Context, client *vim25.Client, clusterObj *object.ClusterComputeResource) (map[string]cmmdsEntity, error) {
hosts, err := clusterObj.Hosts(ctx)
if err != nil {
return nil, fmt.Errorf("fail to get host: %w", err)
}
if len(hosts) == 0 {
return make(map[string]cmmdsEntity), nil
}
queries := []types.HostVsanInternalSystemCmmdsQuery{
{Type: "HOSTNAME"},
{Type: "DISK"},
{Type: "DISK_CAPACITY_TIER"},
}
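// HOSTNAME entries map host UUIDs to host names; DISK and DISK_CAPACITY_TIER entries
// carry the per-disk device information used later for tagging.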
// Some ESXi hosts can be down or in maintenance mode, so the cmmds query might fail on them.
// We iterate until we get a proper API response
var resp *types.QueryCmmdsResponse
for _, host := range hosts {
vis, err := host.ConfigManager().VsanInternalSystem(ctx)
if err != nil {
continue
}
request := types.QueryCmmds{
This: vis.Reference(),
Queries: queries,
}
resp, err = methods.QueryCmmds(ctx, client.RoundTripper, &request)
if err == nil {
break
}
}
if resp == nil {
return nil, errors.New("all hosts failed to query cmmds")
}
var clusterCmmds cmmds
if err := json.Unmarshal([]byte(resp.Returnval), &clusterCmmds); err != nil {
return nil, fmt.Errorf("fail to convert cmmds to json: %w", err)
}
cmmdsMap := make(map[string]cmmdsEntity)
for _, entity := range clusterCmmds.Res {
cmmdsMap[entity.UUID] = entity
}
return cmmdsMap, nil
}
// queryPerformance adds performance metrics to telegraf accumulator
func (e *endpoint) queryPerformance(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, metrics map[string]string,
cmmds map[string]cmmdsEntity, acc telegraf.Accumulator) error {
end := time.Now().UTC()
// We're using a fake metric key, since we only store one highwater mark per resource
start, ok := e.hwMarks.get(hwMarksKeyPrefix+clusterRef.ref.Value, "generic")
if !ok {
// Look back 3 sampling periods by default
start = end.Add(time.Duration(e.parent.MetricLookback) * time.Duration(-e.resourceKinds["vsan"].sampling) * time.Second)
}
e.parent.Log.Debugf("[vSAN] Query vsan performance for time interval: %s ~ %s", start, end)
latest := start
var commonError error
for entityRefID := range metrics {
if !strings.HasPrefix(entityRefID, perfPrefix) {
continue
}
entityRefID = strings.TrimPrefix(entityRefID, perfPrefix)
var perfSpecs []vsantypes.VsanPerfQuerySpec
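// The ":*" suffix requests every instance of this entity type (e.g. all hosts or all disks) in one query.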
perfSpec := vsantypes.VsanPerfQuerySpec{
EntityRefId: entityRefID + ":*",
StartTime: &start,
EndTime: &end,
}
perfSpecs = append(perfSpecs, perfSpec)
perfRequest := vsantypes.VsanPerfQueryPerf{
This: perfManagerRef,
QuerySpecs: perfSpecs,
Cluster: &clusterRef.ref,
}
resp, err := vsanmethods.VsanPerfQueryPerf(ctx, vsanClient, &perfRequest)
if err != nil {
if err.Error() == "ServerFaultCode: NotFound" {
e.parent.Log.Errorf("[vSAN] Is vSAN performance service enabled for %s? Skipping ...", clusterRef.name)
commonError = err
break
}
e.parent.Log.Errorf("[vSAN] Error querying performance data for %s: %s: %s.", clusterRef.name, entityRefID, err)
continue
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.url.Host)
count := 0
for _, em := range resp.Returnval {
vals := strings.Split(em.EntityRefId, ":")
var entityName, uuid string
if len(vals) == 1 {
entityName, uuid = vals[0], ""
} else {
entityName, uuid = vals[0], vals[1]
}
buckets := make(map[string]metricEntry)
tags := populateCMMDSTags(tags, entityName, uuid, cmmds)
var timeStamps []time.Time
// 1. Construct a timestamp list from sample info
formattedEntityName := hyphenReplacer.Replace(entityName)
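// SampleInfo is a comma-separated list of UTC timestamps, one per data point in each value series.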
for _, t := range strings.Split(em.SampleInfo, ",") {
// Parse the input string to a time.Time object
utcTimeStamp, err := time.Parse("2006-01-02 15:04:05", t)
if err != nil {
e.parent.Log.Errorf("[vSAN] Failed to parse a timestamp: %s. Skipping", utcTimeStamp)
timeStamps = append(timeStamps, time.Time{})
continue
}
timeStamps = append(timeStamps, utcTimeStamp)
}
// 2. Iterate on each measurement
for _, counter := range em.Value {
metricLabel := internal.SnakeCase(counter.MetricId.Label)
// 3. Iterate on each data point.
for i, values := range strings.Split(counter.Values, ",") {
ts := timeStamps[i]
if ts.IsZero() {
continue
}
// Organize the metrics into a bucket per measurement.
bKey := em.EntityRefId + " " + strconv.FormatInt(ts.UnixNano(), 10)
bucket, found := buckets[bKey]
if !found {
mn := vsanPerfMetricsName + e.parent.Separator + formattedEntityName
bucket = metricEntry{name: mn, ts: ts, fields: make(map[string]interface{}), tags: tags}
buckets[bKey] = bucket
}
if v, err := strconv.ParseFloat(values, 32); err == nil {
bucket.fields[metricLabel] = v
}
}
}
if len(timeStamps) > 0 {
lastSample := timeStamps[len(timeStamps)-1]
if !lastSample.IsZero() && lastSample.After(latest) {
latest = lastSample
}
}
// We've iterated through all the metrics and collected buckets for each measurement name. Now emit them!
for _, bucket := range buckets {
acc.AddFields(bucket.name, bucket.fields, bucket.tags, bucket.ts)
}
count += len(buckets)
}
}
e.hwMarks.put(hwMarksKeyPrefix+clusterRef.ref.Value, "generic", latest)
return commonError
}
// queryDiskUsage adds 'FreeCapacityB' and 'TotalCapacityB' metrics to telegraf accumulator
func (e *endpoint) queryDiskUsage(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
spaceManagerRef := types.ManagedObjectReference{
Type: "VsanSpaceReportSystem",
Value: "vsan-cluster-space-report-system",
}
resp, err := vsanmethods.VsanQuerySpaceUsage(ctx, vsanClient,
&vsantypes.VsanQuerySpaceUsage{
This: spaceManagerRef,
Cluster: clusterRef.ref,
})
if err != nil {
return err
}
fields := map[string]interface{}{
"free_capacity_byte": resp.Returnval.FreeCapacityB,
"total_capacity_byte": resp.Returnval.TotalCapacityB,
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.url.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// queryHealthSummary adds 'OverallHealth' metric to telegraf accumulator
func (e *endpoint) queryHealthSummary(ctx context.Context, vsanClient *soap.Client, clusterRef *objectRef, acc telegraf.Accumulator) error {
healthSystemRef := types.ManagedObjectReference{
Type: "VsanVcClusterHealthSystem",
Value: "vsan-cluster-health-system",
}
fetchFromCache := true
resp, err := vsanmethods.VsanQueryVcClusterHealthSummary(ctx, vsanClient,
&vsantypes.VsanQueryVcClusterHealthSummary{
This: healthSystemRef,
Cluster: &clusterRef.ref,
Fields: []string{"overallHealth", "overallHealthDescription"},
FetchFromCache: &fetchFromCache,
})
if err != nil {
return err
}
healthStr := resp.Returnval.OverallHealth
healthMap := map[string]int{"red": 2, "yellow": 1, "green": 0}
fields := make(map[string]interface{})
if val, ok := healthMap[healthStr]; ok {
fields["overall_health"] = val
}
tags := populateClusterTags(make(map[string]string), clusterRef, e.url.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// queryResyncSummary adds resync information to accumulator
func (e *endpoint) queryResyncSummary(ctx context.Context, vsanClient *soap.Client, clusterObj *object.ClusterComputeResource,
clusterRef *objectRef, acc telegraf.Accumulator) error {
if lower := versionLowerThan(e.apiVersion, 6, 7); lower {
e.parent.Log.Infof("I! [inputs.vsphere][vSAN] Minimum API Version 6.7 required for resync summary. Found: %s. Skipping VCenter: %s",
e.apiVersion, e.url.Host)
return nil
}
hosts, err := clusterObj.Hosts(ctx)
if err != nil {
return err
}
if len(hosts) == 0 {
return nil
}
hostRefValue := hosts[0].Reference().Value
hostRefValueParts := strings.Split(hostRefValue, "-")
if len(hostRefValueParts) != 2 {
e.parent.Log.Errorf("[vSAN] Host reference value does not match expected pattern: host-<num>. Actual Value %s", hostRefValue)
return err
}
vsanSystemEx := types.ManagedObjectReference{
Type: "VsanSystemEx",
Value: "vsanSystemEx-" + strings.Split(hostRefValue, "-")[1],
}
includeSummary := true
request := vsantypes.VsanQuerySyncingVsanObjects{
This: vsanSystemEx,
Uuids: make([]string, 0), // We only need summary information.
Start: 0,
IncludeSummary: &includeSummary,
}
resp, err := vsanmethods.VsanQuerySyncingVsanObjects(ctx, vsanClient, &request)
if err != nil {
return err
}
fields := make(map[string]interface{})
fields["total_bytes_to_sync"] = resp.Returnval.TotalBytesToSync
fields["total_objects_to_sync"] = resp.Returnval.TotalObjectsToSync
fields["total_recovery_eta"] = resp.Returnval.TotalRecoveryETA
tags := populateClusterTags(make(map[string]string), clusterRef, e.url.Host)
acc.AddFields(vsanSummaryMetricsName, fields, tags)
return nil
}
// populateClusterTags takes in a tag map, makes a copy, populates cluster related tags and returns the copy.
func populateClusterTags(tags map[string]string, clusterRef *objectRef, vcenter string) map[string]string {
newTags := make(map[string]string)
// deep copy
for k, v := range tags {
newTags[k] = v
}
newTags["vcenter"] = vcenter
newTags["dcname"] = clusterRef.dcname
newTags["clustername"] = clusterRef.name
newTags["moid"] = clusterRef.ref.Value
newTags["source"] = clusterRef.name
return newTags
}
// populateCMMDSTags takes in a tag map, makes a copy, adds more tags using a cmmds map and returns the copy.
func populateCMMDSTags(tags map[string]string, entityName, uuid string, cmmds map[string]cmmdsEntity) map[string]string {
newTags := make(map[string]string)
// deep copy
for k, v := range tags {
newTags[k] = v
}
// There are cases when the uuid is missing. (Usually happens when performance service is just enabled or disabled)
// We need this check to avoid index-out-of-range error
if uuid == "*" || uuid == "" {
return newTags
}
// Add additional tags based on CMMDS data
switch {
case strings.Contains(entityName, "-disk") || strings.Contains(entityName, "disk-"):
if e, ok := cmmds[uuid]; ok {
if host, ok := cmmds[e.Owner]; ok {
newTags["hostname"] = host.Content.Hostname
}
newTags["devicename"] = e.Content.DevName
// Skip adding ssduuid tag for VSAN ESA disks as this property is not returned in the CMMDS data
if !(strings.Contains(entityName, "-esa-")) {
if int(e.Content.IsSsd) == 0 {
newTags["ssduuid"] = e.Content.SsdUUID
}
}
}
case strings.Contains(entityName, "host-memory-"):
memInfo := strings.Split(uuid, "|")
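// For host memory entities the uuid has the form "<host-uuid>|<slab-or-heap-name>".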
if strings.Contains(entityName, "-slab") && len(memInfo) > 1 {
newTags["slabname"] = memInfo[1]
}
if strings.Contains(entityName, "-heap") && len(memInfo) > 1 {
newTags["heapname"] = memInfo[1]
}
if e, ok := cmmds[memInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "host-") || strings.Contains(entityName, "system-mem"):
if e, ok := cmmds[uuid]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "vnic-net"):
nicInfo := strings.Split(uuid, "|")
if len(nicInfo) > 2 {
newTags["stackname"] = nicInfo[1]
newTags["vnic"] = nicInfo[2]
}
if e, ok := cmmds[nicInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "pnic-net"):
nicInfo := strings.Split(uuid, "|")
if len(nicInfo) > 1 {
newTags["pnic"] = nicInfo[1]
}
if e, ok := cmmds[nicInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
case strings.Contains(entityName, "world-cpu"):
cpuInfo := strings.Split(uuid, "|")
if len(cpuInfo) > 1 {
newTags["worldname"] = cpuInfo[1]
}
if e, ok := cmmds[cpuInfo[0]]; ok {
newTags["hostname"] = e.Content.Hostname
}
default:
// If no tags are added in previous steps, we add uuid for it
if len(newTags) == len(tags) {
newTags["uuid"] = uuid
}
}
return newTags
}
// versionLowerThan returns true if the current version is lower than the given major.minor base version
func versionLowerThan(current string, major, minor int) bool {
version := strings.Split(current, ".")
currentMajor, err := strconv.Atoi(version[0])
if err != nil {
return false
}
if currentMajor > major {
return false
}
if currentMajor == major {
if len(version) < 2 {
return true
}
currentMinor, err := strconv.Atoi(version[1])
if err != nil {
return true
}
if currentMinor >= minor {
return false
}
}
return true
}
type cmmdsEntity struct {
UUID string `json:"uuid"`
Owner string `json:"owner"` // ESXi UUID
Type string `json:"type"`
Content cmmdsContent `json:"content"`
}
type cmmds struct {
Res []cmmdsEntity `json:"result"`
}
type cmmdsContent struct {
Hostname string `json:"hostname"`
IsSsd float64 `json:"isSsd"`
SsdUUID string `json:"ssdUuid"`
DevName string `json:"devName"`
}


@@ -0,0 +1,181 @@
//go:generate ../../../tools/readme_config_includer/generator
package vsphere
import (
"context"
_ "embed"
"errors"
"sync"
"time"
"github.com/vmware/govmomi/vim25/soap"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/plugins/common/proxy"
"github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/plugins/inputs"
)
//go:embed sample.conf
var sampleConfig string
type VSphere struct {
Vcenters []string `toml:"vcenters"`
Username config.Secret `toml:"username"`
Password config.Secret `toml:"password"`
DatacenterInstances bool `toml:"datacenter_instances"`
DatacenterMetricInclude []string `toml:"datacenter_metric_include"`
DatacenterMetricExclude []string `toml:"datacenter_metric_exclude"`
DatacenterInclude []string `toml:"datacenter_include"`
DatacenterExclude []string `toml:"datacenter_exclude"`
ClusterInstances bool `toml:"cluster_instances"`
ClusterMetricInclude []string `toml:"cluster_metric_include"`
ClusterMetricExclude []string `toml:"cluster_metric_exclude"`
ClusterInclude []string `toml:"cluster_include"`
ClusterExclude []string `toml:"cluster_exclude"`
ResourcePoolInstances bool `toml:"resource_pool_instances"`
ResourcePoolMetricInclude []string `toml:"resource_pool_metric_include"`
ResourcePoolMetricExclude []string `toml:"resource_pool_metric_exclude"`
ResourcePoolInclude []string `toml:"resource_pool_include"`
ResourcePoolExclude []string `toml:"resource_pool_exclude"`
HostInstances bool `toml:"host_instances"`
HostMetricInclude []string `toml:"host_metric_include"`
HostMetricExclude []string `toml:"host_metric_exclude"`
HostInclude []string `toml:"host_include"`
HostExclude []string `toml:"host_exclude"`
VMInstances bool `toml:"vm_instances"`
VMMetricInclude []string `toml:"vm_metric_include"`
VMMetricExclude []string `toml:"vm_metric_exclude"`
VMInclude []string `toml:"vm_include"`
VMExclude []string `toml:"vm_exclude"`
DatastoreInstances bool `toml:"datastore_instances"`
DatastoreMetricInclude []string `toml:"datastore_metric_include"`
DatastoreMetricExclude []string `toml:"datastore_metric_exclude"`
DatastoreInclude []string `toml:"datastore_include"`
DatastoreExclude []string `toml:"datastore_exclude"`
VSANMetricInclude []string `toml:"vsan_metric_include"`
VSANMetricExclude []string `toml:"vsan_metric_exclude"`
VSANMetricSkipVerify bool `toml:"vsan_metric_skip_verify"`
VSANClusterInclude []string `toml:"vsan_cluster_include"`
VSANInterval config.Duration `toml:"vsan_interval"`
Separator string `toml:"separator"`
CustomAttributeInclude []string `toml:"custom_attribute_include"`
CustomAttributeExclude []string `toml:"custom_attribute_exclude"`
UseIntSamples bool `toml:"use_int_samples"`
IPAddresses []string `toml:"ip_addresses"`
MetricLookback int `toml:"metric_lookback"`
DisconnectedServersBehavior string `toml:"disconnected_servers_behavior"`
MaxQueryObjects int `toml:"max_query_objects"`
MaxQueryMetrics int `toml:"max_query_metrics"`
CollectConcurrency int `toml:"collect_concurrency"`
DiscoverConcurrency int `toml:"discover_concurrency"`
ForceDiscoverOnInit bool `toml:"force_discover_on_init" deprecated:"1.14.0;1.35.0;option is ignored"`
ObjectDiscoveryInterval config.Duration `toml:"object_discovery_interval"`
Timeout config.Duration `toml:"timeout"`
HistoricalInterval config.Duration `toml:"historical_interval"`
Log telegraf.Logger `toml:"-"`
tls.ClientConfig // Mix in the TLS/SSL goodness from core
proxy.HTTPProxy
endpoints []*endpoint
cancel context.CancelFunc
}
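// A minimal illustrative configuration for this struct; the key names are taken from the
// TOML tags above, while the host and credential values are placeholders:
//
//   [[inputs.vsphere]]
//     vcenters = ["https://vcenter.example.com/sdk"]
//     username = "telegraf@vsphere.local"
//     password = "secret"
//     vm_metric_include   = ["cpu.usage.average", "mem.usage.average"]
//     host_metric_include = ["cpu.usage.average", "mem.usage.average"]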
func (*VSphere) SampleConfig() string {
return sampleConfig
}
func (v *VSphere) Start(_ telegraf.Accumulator) error {
v.Log.Info("Starting plugin")
ctx, cancel := context.WithCancel(context.Background())
v.cancel = cancel
// Create endpoints, one for each vCenter we're monitoring
v.endpoints = make([]*endpoint, 0, len(v.Vcenters))
for _, rawURL := range v.Vcenters {
u, err := soap.ParseURL(rawURL)
if err != nil {
return err
}
ep, err := newEndpoint(ctx, v, u, v.Log)
if err != nil {
return err
}
v.endpoints = append(v.endpoints, ep)
}
return nil
}
func (v *VSphere) Gather(acc telegraf.Accumulator) error {
var wg sync.WaitGroup
for _, ep := range v.endpoints {
wg.Add(1)
go func(endpoint *endpoint) {
defer wg.Done()
err := endpoint.collect(context.Background(), acc)
if errors.Is(err, context.Canceled) {
// No need to signal errors if we were merely canceled.
err = nil
}
if err != nil {
acc.AddError(err)
}
}(ep)
}
wg.Wait()
return nil
}
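// Usage note (inferred from the code above): each configured vCenter endpoint is collected
// concurrently, and a context.Canceled error from a collection is treated as a clean
// shutdown rather than being reported through the accumulator.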
func (v *VSphere) Stop() {
v.Log.Info("Stopping plugin")
v.cancel()
// Wait for all endpoints to finish. No need to wait for
// Gather() to finish here, since Stop() will only be called
// after the last Gather() has finished. We do, however, need to
// wait for any discovery to complete by trying to grab the
// "busy" mutex.
for _, ep := range v.endpoints {
v.Log.Debugf("Waiting for endpoint %q to finish", ep.url.Host)
func() {
ep.busy.Lock() // Wait until discovery is finished
defer ep.busy.Unlock()
ep.close()
}()
}
}
func init() {
inputs.Add("vsphere", func() telegraf.Input {
return &VSphere{
DatacenterInclude: []string{"/*"},
ClusterInclude: []string{"/*/host/**"},
HostInstances: true,
HostInclude: []string{"/*/host/**"},
ResourcePoolInclude: []string{"/*/host/**"},
VMInstances: true,
VMInclude: []string{"/*/vm/**"},
DatastoreInclude: []string{"/*/datastore/**"},
VSANMetricExclude: []string{"*"},
VSANClusterInclude: []string{"/*/host/**"},
Separator: "_",
CustomAttributeExclude: []string{"*"},
UseIntSamples: true,
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
CollectConcurrency: 1,
DiscoverConcurrency: 1,
MetricLookback: 3,
ForceDiscoverOnInit: true,
ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 60),
HistoricalInterval: config.Duration(time.Second * 300),
VSANInterval: config.Duration(time.Second * 300),
DisconnectedServersBehavior: "error",
HTTPProxy: proxy.HTTPProxy{UseSystemProxy: true},
}
})
}

View file

@ -0,0 +1,615 @@
package vsphere
import (
"context"
"crypto/tls"
"net/url"
"os"
"strings"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/simulator"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/types"
"github.com/influxdata/telegraf/config"
common_tls "github.com/influxdata/telegraf/plugins/common/tls"
"github.com/influxdata/telegraf/testutil"
)
func defaultVSphere() *VSphere {
return &VSphere{
Log: testutil.Logger{},
ClusterMetricInclude: []string{
"cpu.usage.*",
"cpu.usagemhz.*",
"mem.usage.*",
"mem.active.*"},
ClusterMetricExclude: nil,
ClusterInclude: []string{"/**"},
HostMetricInclude: []string{
"cpu.coreUtilization.average",
"cpu.costop.summation",
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.swapwait.summation",
"cpu.usage.average",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.utilization.average",
"cpu.wait.summation",
"disk.deviceReadLatency.average",
"disk.deviceWriteLatency.average",
"disk.kernelReadLatency.average",
"disk.kernelWriteLatency.average",
"disk.numberReadAveraged.average",
"disk.numberWriteAveraged.average",
"disk.read.average",
"disk.totalReadLatency.average",
"disk.totalWriteLatency.average",
"disk.write.average",
"mem.active.average",
"mem.latency.average",
"mem.state.latest",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.totalCapacity.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.errorsRx.summation",
"net.errorsTx.summation",
"net.usage.average",
"power.power.average",
"storageAdapter.numberReadAveraged.average",
"storageAdapter.numberWriteAveraged.average",
"storageAdapter.read.average",
"storageAdapter.write.average",
"sys.uptime.latest"},
HostMetricExclude: nil,
HostInclude: []string{"/**"},
VMMetricInclude: []string{
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.run.summation",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.wait.summation",
"mem.active.average",
"mem.granted.average",
"mem.latency.average",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.usage.average",
"power.power.average",
"virtualDisk.numberReadAveraged.average",
"virtualDisk.numberWriteAveraged.average",
"virtualDisk.read.average",
"virtualDisk.readOIO.latest",
"virtualDisk.throughput.usage.average",
"virtualDisk.totalReadLatency.average",
"virtualDisk.totalWriteLatency.average",
"virtualDisk.write.average",
"virtualDisk.writeOIO.latest",
"sys.uptime.latest"},
VMMetricExclude: nil,
VMInclude: []string{"/**"},
DatastoreMetricInclude: []string{
"disk.used.*",
"disk.provisioned.*"},
DatastoreMetricExclude: nil,
DatastoreInclude: []string{"/**"},
ResourcePoolMetricInclude: []string{
"cpu.capacity.*",
"mem.capacity.*"},
ResourcePoolMetricExclude: nil,
ResourcePoolInclude: []string{"/**"},
DatacenterMetricInclude: nil,
DatacenterMetricExclude: nil,
DatacenterInclude: []string{"/**"},
ClientConfig: common_tls.ClientConfig{InsecureSkipVerify: true},
MaxQueryObjects: 256,
MaxQueryMetrics: 256,
ObjectDiscoveryInterval: config.Duration(time.Second * 300),
Timeout: config.Duration(time.Second * 20),
ForceDiscoverOnInit: true,
DiscoverConcurrency: 1,
CollectConcurrency: 1,
Separator: ".",
HistoricalInterval: config.Duration(time.Second * 300),
}
}
func createSim(folders int) (*simulator.Model, *simulator.Server, error) {
model := simulator.VPX()
model.Folder = folders
model.Datacenter = 2
// model.App = 1
err := model.Create()
if err != nil {
return nil, nil, err
}
model.Service.TLS = new(tls.Config)
s := model.Service.NewServer()
return model, s, nil
}
func testAlignUniform(t *testing.T, n int) {
now := time.Now().Truncate(60 * time.Second)
info := make([]types.PerfSampleInfo, 0, n)
values := make([]int64, 0, n)
for i := 0; i < n; i++ {
info = append(info, types.PerfSampleInfo{
Timestamp: now.Add(time.Duration(20*i) * time.Second),
Interval: 20,
})
values = append(values, 1)
}
e := endpoint{log: testutil.Logger{}}
newInfo, newValues := e.alignSamples(info, values, 60*time.Second)
require.Len(t, newInfo, n/3, "Aligned infos have wrong size")
require.Len(t, newValues, n/3, "Aligned values have wrong size")
for _, v := range newValues {
require.InDelta(t, 1.0, v, testutil.DefaultDelta, "Aligned value should be 1")
}
}
func TestAlignMetrics(t *testing.T) {
testAlignUniform(t, 3)
testAlignUniform(t, 30)
testAlignUniform(t, 333)
// 20s to 60s of 1,2,3,1,2,3... (should average to 2)
n := 30
now := time.Now().Truncate(60 * time.Second)
info := make([]types.PerfSampleInfo, 0, n)
values := make([]int64, 0, n)
for i := 0; i < n; i++ {
info = append(info, types.PerfSampleInfo{
Timestamp: now.Add(time.Duration(20*i) * time.Second),
Interval: 20,
})
values = append(values, int64(i%3+1))
}
e := endpoint{log: testutil.Logger{}}
newInfo, newValues := e.alignSamples(info, values, 60*time.Second)
require.Len(t, newInfo, n/3, "Aligned infos have wrong size")
require.Len(t, newValues, n/3, "Aligned values have wrong size")
for _, v := range newValues {
require.InDelta(t, 2.0, v, testutil.DefaultDelta, "Aligned value should be 2")
}
}
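// Reading of the two tests above: alignSamples appears to downsample 20-second samples
// into 60-second buckets by averaging, so a uniform series of 1s stays at 1 and a
// repeating 1,2,3 pattern collapses to 2 per aligned bucket.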
func TestConfigDurationParsing(t *testing.T) {
v := defaultVSphere()
require.Equal(t, int32(300), int32(time.Duration(v.HistoricalInterval).Seconds()), "HistoricalInterval.Seconds() with default duration should resolve 300")
}
func TestMaxQuery(t *testing.T) {
if testing.Short() {
t.Skip("Skipping long test in short mode")
}
m, s, err := createSim(0)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v := defaultVSphere()
v.MaxQueryMetrics = 256
c, err := newClient(t.Context(), s.URL, v)
require.NoError(t, err)
require.Equal(t, 256, v.MaxQueryMetrics)
om := object.NewOptionManager(c.client.Client, *c.client.Client.ServiceContent.Setting)
err = om.Update(t.Context(), []types.BaseOptionValue{&types.OptionValue{
Key: "config.vpxd.stats.maxQueryMetrics",
Value: "42",
}})
require.NoError(t, err)
v.MaxQueryMetrics = 256
c2, err := newClient(t.Context(), s.URL, v)
require.NoError(t, err)
require.Equal(t, 42, v.MaxQueryMetrics)
c.close()
c2.close()
}
func testLookupVM(ctx context.Context, t *testing.T, f *finder, path string, expected int, expectedName string) {
poweredOn := types.VirtualMachinePowerState("poweredOn")
var vm []mo.VirtualMachine
err := f.find(ctx, "VirtualMachine", path, &vm)
require.NoError(t, err)
require.Len(t, vm, expected)
if expectedName != "" {
require.Equal(t, expectedName, vm[0].Name)
}
for i := range vm {
v := &vm[i]
require.Equal(t, poweredOn, v.Runtime.PowerState)
}
}
func TestFinder(t *testing.T) {
if testing.Short() {
t.Skip("Skipping long test in short mode")
}
m, s, err := createSim(0)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v := defaultVSphere()
c, err := newClient(t.Context(), s.URL, v)
require.NoError(t, err)
f := finder{c}
var dc []mo.Datacenter
err = f.find(t.Context(), "Datacenter", "/DC0", &dc)
require.NoError(t, err)
require.Len(t, dc, 1)
require.Equal(t, "DC0", dc[0].Name)
var host []mo.HostSystem
err = f.find(t.Context(), "HostSystem", "/DC0/host/DC0_H0/DC0_H0", &host)
require.NoError(t, err)
require.Len(t, host, 1)
require.Equal(t, "DC0_H0", host[0].Name)
host = make([]mo.HostSystem, 0)
err = f.find(t.Context(), "HostSystem", "/DC0/host/DC0_C0/DC0_C0_H0", &host)
require.NoError(t, err)
require.Len(t, host, 1)
require.Equal(t, "DC0_C0_H0", host[0].Name)
resourcepool := make([]mo.ResourcePool, 0)
err = f.find(t.Context(), "ResourcePool", "/DC0/host/DC0_C0/Resources/DC0_C0_RP0", &resourcepool)
require.NoError(t, err)
require.Len(t, host, 1)
require.Equal(t, "DC0_C0_H0", host[0].Name)
host = make([]mo.HostSystem, 0)
err = f.find(t.Context(), "HostSystem", "/DC0/host/DC0_C0/*", &host)
require.NoError(t, err)
require.Len(t, host, 3)
var vm []mo.VirtualMachine
testLookupVM(t.Context(), t, &f, "/DC0/vm/DC0_H0_VM0", 1, "")
testLookupVM(t.Context(), t, &f, "/DC0/vm/DC0_C0*", 2, "")
testLookupVM(t.Context(), t, &f, "/DC0/*/DC0_H0_VM0", 1, "DC0_H0_VM0")
testLookupVM(t.Context(), t, &f, "/DC0/*/DC0_H0_*", 2, "")
testLookupVM(t.Context(), t, &f, "/DC0/**/DC0_H0_VM*", 2, "")
testLookupVM(t.Context(), t, &f, "/DC0/**", 4, "")
testLookupVM(t.Context(), t, &f, "/DC1/**", 4, "")
testLookupVM(t.Context(), t, &f, "/**", 8, "")
testLookupVM(t.Context(), t, &f, "/**/vm/**", 8, "")
testLookupVM(t.Context(), t, &f, "/*/host/**/*DC*", 8, "")
testLookupVM(t.Context(), t, &f, "/*/host/**/*DC*VM*", 8, "")
testLookupVM(t.Context(), t, &f, "/*/host/**/*DC*/*/*DC*", 4, "")
vm = make([]mo.VirtualMachine, 0)
err = f.findAll(t.Context(), "VirtualMachine", []string{"/DC0/vm/DC0_H0*", "/DC0/vm/DC0_C0*"}, nil, &vm)
require.NoError(t, err)
require.Len(t, vm, 4)
rf := resourceFilter{
finder: &f,
paths: []string{"/DC0/vm/DC0_H0*", "/DC0/vm/DC0_C0*"},
excludePaths: []string{"/DC0/vm/DC0_H0_VM0"},
resType: "VirtualMachine",
}
vm = make([]mo.VirtualMachine, 0)
require.NoError(t, rf.findAll(t.Context(), &vm))
require.Len(t, vm, 3)
rf = resourceFilter{
finder: &f,
paths: []string{"/DC0/vm/DC0_H0*", "/DC0/vm/DC0_C0*"},
excludePaths: []string{"/**"},
resType: "VirtualMachine",
}
vm = make([]mo.VirtualMachine, 0)
require.NoError(t, rf.findAll(t.Context(), &vm))
require.Empty(t, vm)
rf = resourceFilter{
finder: &f,
paths: []string{"/**"},
excludePaths: []string{"/**"},
resType: "VirtualMachine",
}
vm = make([]mo.VirtualMachine, 0)
require.NoError(t, rf.findAll(t.Context(), &vm))
require.Empty(t, vm)
rf = resourceFilter{
finder: &f,
paths: []string{"/**"},
excludePaths: []string{"/this won't match anything"},
resType: "VirtualMachine",
}
vm = make([]mo.VirtualMachine, 0)
require.NoError(t, rf.findAll(t.Context(), &vm))
require.Len(t, vm, 8)
rf = resourceFilter{
finder: &f,
paths: []string{"/**"},
excludePaths: []string{"/**/*VM0"},
resType: "VirtualMachine",
}
vm = make([]mo.VirtualMachine, 0)
require.NoError(t, rf.findAll(t.Context(), &vm))
require.Len(t, vm, 4)
}
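// Pattern semantics exercised above, as inferred from the expected counts: "*" matches a
// single hierarchy level while "**" matches recursively, so "/DC0/**" finds the 4 VMs of
// one datacenter and "/**" finds all 8 across both datacenters.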
func TestFolders(t *testing.T) {
if testing.Short() {
t.Skip("Skipping long test in short mode")
}
m, s, err := createSim(1)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v := defaultVSphere()
c, err := newClient(t.Context(), s.URL, v)
require.NoError(t, err)
f := finder{c}
var folder []mo.Folder
err = f.find(t.Context(), "Folder", "/F0", &folder)
require.NoError(t, err)
require.Len(t, folder, 1)
require.Equal(t, "F0", folder[0].Name)
var dc []mo.Datacenter
err = f.find(t.Context(), "Datacenter", "/F0/DC1", &dc)
require.NoError(t, err)
require.Len(t, dc, 1)
require.Equal(t, "DC1", dc[0].Name)
testLookupVM(t.Context(), t, &f, "/F0/DC0/vm/**/F*", 0, "")
testLookupVM(t.Context(), t, &f, "/F0/DC1/vm/**/F*/*VM*", 4, "")
testLookupVM(t.Context(), t, &f, "/F0/DC1/vm/**/F*/**", 4, "")
}
func TestVsanCmmds(t *testing.T) {
m, s, err := createSim(0)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v := defaultVSphere()
c, err := newClient(t.Context(), s.URL, v)
require.NoError(t, err)
f := finder{c}
var clusters []mo.ClusterComputeResource
err = f.findAll(t.Context(), "ClusterComputeResource", []string{"/**"}, nil, &clusters)
require.NoError(t, err)
clusterObj := object.NewClusterComputeResource(c.client.Client, clusters[0].Reference())
_, err = getCmmdsMap(t.Context(), c.client.Client, clusterObj)
require.Error(t, err)
}
func TestVsanTags(t *testing.T) {
host := "5b860329-3bc4-a76c-48b6-246e963cfcc0"
disk := "52ee3be1-47cc-b50d-ecab-01af0f706381"
ssdDisk := "52f26fc8-0b9b-56d8-3a32-a9c3bfbc6148"
nvmeDisk := "5291e74f-74d3-fca2-6ffa-3655657dd3be"
ssd := "52173131-3384-bb63-4ef8-c00b0ce7e3e7"
hostname := "sc2-hs1-b2801.eng.vmware.com"
devName := "naa.55cd2e414d82c815:2"
var cmmds = map[string]cmmdsEntity{
nvmeDisk: {UUID: nvmeDisk, Type: "DISK_CAPACITY_TIER", Owner: host, Content: cmmdsContent{DevName: devName}},
disk: {UUID: disk, Type: "DISK", Owner: host, Content: cmmdsContent{DevName: devName, IsSsd: 1.}},
ssdDisk: {UUID: ssdDisk, Type: "DISK", Owner: host, Content: cmmdsContent{DevName: devName, IsSsd: 0., SsdUUID: ssd}},
host: {UUID: host, Type: "HOSTNAME", Owner: host, Content: cmmdsContent{Hostname: hostname}},
}
tags := populateCMMDSTags(make(map[string]string), "capacity-disk", disk, cmmds)
require.Len(t, tags, 2)
tags = populateCMMDSTags(make(map[string]string), "cache-disk", ssdDisk, cmmds)
require.Len(t, tags, 3)
tags = populateCMMDSTags(make(map[string]string), "host-domclient", host, cmmds)
require.Len(t, tags, 1)
tags = populateCMMDSTags(make(map[string]string), "vsan-esa-disk-layer", nvmeDisk, cmmds)
require.Len(t, tags, 2)
}
func TestCollectionNoClusterMetrics(t *testing.T) {
if testing.Short() {
t.Skip("Skipping long test in short mode")
}
testCollection(t, true)
}
func TestDisconnectedServerBehavior(t *testing.T) {
u, err := url.Parse("https://definitely.not.a.valid.host")
require.NoError(t, err)
v := defaultVSphere()
v.DisconnectedServersBehavior = "error"
_, err = newEndpoint(t.Context(), v, u, v.Log)
require.Error(t, err)
v.DisconnectedServersBehavior = "ignore"
_, err = newEndpoint(t.Context(), v, u, v.Log)
require.NoError(t, err)
v.DisconnectedServersBehavior = "something else"
_, err = newEndpoint(t.Context(), v, u, v.Log)
require.Error(t, err)
require.Equal(t, `"something else" is not a valid value for disconnected_servers_behavior`, err.Error())
}
func testCollection(t *testing.T, excludeClusters bool) {
mustHaveMetrics := map[string]struct{}{
"vsphere.vm.cpu": {},
"vsphere.vm.mem": {},
"vsphere.vm.net": {},
"vsphere.host.cpu": {},
"vsphere.host.mem": {},
"vsphere.host.net": {},
"vsphere.datastore.disk": {},
}
vCenter := os.Getenv("VCENTER_URL")
username := os.Getenv("VCENTER_USER")
password := os.Getenv("VCENTER_PASSWORD")
v := defaultVSphere()
if vCenter != "" {
v.Vcenters = []string{vCenter}
v.Username = config.NewSecret([]byte(username))
v.Password = config.NewSecret([]byte(password))
} else {
m, s, err := createSim(0)
require.NoError(t, err)
defer m.Remove()
defer s.Close()
v.Vcenters = []string{s.URL.String()}
}
if excludeClusters {
v.ClusterMetricExclude = []string{"*"}
}
var acc testutil.Accumulator
require.NoError(t, v.Start(&acc))
defer v.Stop()
require.NoError(t, v.Gather(&acc))
require.Emptyf(t, acc.Errors, "Errors found: %s", acc.Errors)
require.NotEmpty(t, acc.Metrics, "No metrics were collected")
cache := make(map[string]string)
client, err := v.endpoints[0].clientFactory.getClient(t.Context())
require.NoError(t, err)
hostCache := make(map[string]string)
for _, m := range acc.Metrics {
delete(mustHaveMetrics, m.Measurement)
if strings.HasPrefix(m.Measurement, "vsphere.vm.") {
mustContainAll(t, m.Tags, []string{"esxhostname", "moid", "vmname", "guest", "dcname", "uuid"})
hostName := m.Tags["esxhostname"]
hostMoid, ok := hostCache[hostName]
if !ok {
// We have to follow the host parent path to locate a cluster. Look up the host!
finder := finder{client}
var hosts []mo.HostSystem
err := finder.find(t.Context(), "HostSystem", "/**/"+hostName, &hosts)
require.NoError(t, err)
require.NotEmpty(t, hosts)
hostMoid = hosts[0].Reference().Value
hostCache[hostName] = hostMoid
}
if isInCluster(t, v, client, cache, "HostSystem", hostMoid) { // If the VM lives in a cluster
mustContainAll(t, m.Tags, []string{"clustername"})
}
} else if strings.HasPrefix(m.Measurement, "vsphere.host.") {
if isInCluster(t, v, client, cache, "HostSystem", m.Tags["moid"]) { // If the host lives in a cluster
mustContainAll(t, m.Tags, []string{"esxhostname", "clustername", "moid", "dcname"})
} else {
mustContainAll(t, m.Tags, []string{"esxhostname", "moid", "dcname"})
}
} else if strings.HasPrefix(m.Measurement, "vsphere.cluster.") {
mustContainAll(t, m.Tags, []string{"clustername", "moid", "dcname"})
} else {
mustContainAll(t, m.Tags, []string{"moid", "dcname"})
}
}
require.Empty(t, mustHaveMetrics, "Some metrics were not found")
}
func isInCluster(t *testing.T, v *VSphere, client *client, cache map[string]string, resourceKind, moid string) bool {
ref := types.ManagedObjectReference{
Type: resourceKind,
Value: moid,
}
_, ok := v.endpoints[0].getAncestorName(t.Context(), client, "ClusterComputeResource", cache, ref)
return ok
}
func mustContainAll(t *testing.T, tagMap map[string]string, mustHave []string) {
for _, tag := range mustHave {
require.Contains(t, tagMap, tag)
}
}
func TestVersionLowerThan(t *testing.T) {
tests := []struct {
current string
major int
minor int
result bool
}{
{
current: "7",
major: 6,
minor: 3,
result: false,
},
{
current: "5",
major: 6,
minor: 3,
result: true,
},
{
current: "6.0",
major: 6,
minor: 3,
result: true,
},
{
current: "6.3",
major: 6,
minor: 3,
result: false,
},
{
current: "6.2",
major: 6,
minor: 3,
result: true,
},
{
current: "7.0.3.0",
major: 6,
minor: 7,
result: false,
},
}
for _, tc := range tests {
result := versionLowerThan(tc.current, tc.major, tc.minor)
require.Equalf(t, tc.result, result, "%s < %d.%d", tc.current, tc.major, tc.minor)
}
}