
Adding upstream version 1.34.4.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-24 07:26:29 +02:00
parent e393c3af3f
commit 4978089aab
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
4963 changed files with 677545 additions and 0 deletions


@ -0,0 +1,316 @@
# StatsD Input Plugin
The StatsD input plugin runs a listener service and gathers metrics sent to it by StatsD clients.
## Service Input <!-- @/docs/includes/service_input.md -->
This plugin is a service input. Normal plugins gather metrics determined by the
interval setting. Service plugins start a service to listen and wait for
metrics or events to occur. Service plugins have two key differences from
normal plugins:
1. The global or plugin specific `interval` setting may not apply
2. The CLI options of `--test`, `--test-wait`, and `--once` may not produce
output for this plugin
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and fields, create aliases, and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
## Configuration
```toml @sample.conf
# Statsd Server
[[inputs.statsd]]
## Protocol, must be "tcp", "udp4", "udp6" or "udp" (default=udp)
protocol = "udp"
## MaxTCPConnection - applicable when protocol is set to tcp (default=250)
max_tcp_connections = 250
## Enable TCP keep alive probes (default=false)
tcp_keep_alive = false
## Specifies the keep-alive period for an active network connection.
## Only applies to TCP sockets and will be ignored if tcp_keep_alive is false.
## Defaults to the OS configuration.
# tcp_keep_alive_period = "2h"
## Address and port to host UDP listener on
service_address = ":8125"
## The following configuration options control when telegraf clears its cache
## of previous values. If set to false, then telegraf will only clear its
## cache when the daemon is restarted.
## Reset gauges every interval (default=true)
delete_gauges = true
## Reset counters every interval (default=true)
delete_counters = true
## Reset sets every interval (default=true)
delete_sets = true
## Reset timings & histograms every interval (default=true)
delete_timings = true
## Enable aggregation temporality. This adds a temporality=delta or
## temporality=cumulative tag and a start_time field recording when the
## metric accumulation started. You should use this with the OpenTelemetry output.
# enable_aggregation_temporality = false
## Percentiles to calculate for timing & histogram stats.
percentiles = [50.0, 90.0, 99.0, 99.9, 99.95, 100.0]
## separator to use between elements of a statsd metric
metric_separator = "_"
## Parses extensions to statsd in the datadog statsd format
## currently supports metrics and datadog tags.
## http://docs.datadoghq.com/guides/dogstatsd/
datadog_extensions = false
## Parses distributions metric as specified in the datadog statsd format
## https://docs.datadoghq.com/developers/metrics/types/?tab=distribution#definition
datadog_distributions = false
## Keep or drop the container id as tag. Included as optional field
## in DogStatsD protocol v1.2 if source is running in Kubernetes
## https://docs.datadoghq.com/developers/dogstatsd/datagram_shell/?tab=metrics#dogstatsd-protocol-v12
datadog_keep_container_tag = false
## Statsd data translation templates, more info can be read here:
## https://github.com/influxdata/telegraf/blob/master/docs/TEMPLATE_PATTERN.md
# templates = [
# "cpu.* measurement*"
# ]
## Number of UDP messages allowed to queue up, once filled,
## the statsd server will start dropping packets
allowed_pending_messages = 10000
## Number of worker threads used to parse the incoming messages.
# number_workers_threads = 5
## Number of timing/histogram values to track per-measurement in the
## calculation of percentiles. Raising this limit increases the accuracy
## of percentiles but also increases the memory usage and cpu time.
percentile_limit = 1000
## Maximum socket buffer size in bytes, once the buffer fills up, metrics
## will start dropping. Defaults to the OS default.
# read_buffer_size = 65535
## Max duration (TTL) for each metric to stay cached/reported without being updated.
# max_ttl = "10h"
## Sanitize name method
## By default, telegraf will pass names directly as they are received.
## However, upstream statsd now does sanitization of names which can be
## enabled by using the "upstream" method option. This option will a) replace
## white space with '_', replace '/' with '-', and remove characters not
## matching 'a-zA-Z_\-0-9\.;='.
# sanitize_name_method = ""
## Replace dots (.) with underscore (_) and dashes (-) with
## double underscore (__) in metric names.
# convert_names = false
## Convert all numeric counters to float
## Enabling this ensures that both counters and gauges are emitted
## as floats.
# float_counters = false
## Emit timings `metric_<name>_count` field as float, the same as all other
## histogram fields
# float_timings = false
## Emit sets as float
# float_sets = false
```
## Description
The statsd plugin is a special type of plugin which runs a background statsd
listener service while telegraf is running.
The format of the statsd messages was based on the format described in the
original [etsy
statsd](https://github.com/etsy/statsd/blob/master/docs/metric_types.md)
implementation. In short, the telegraf statsd listener will accept:
- Gauges
- `users.current.den001.myapp:32|g` <- standard
- `users.current.den001.myapp:+10|g` <- additive
- `users.current.den001.myapp:-10|g`
- Counters
- `deploys.test.myservice:1|c` <- increments by 1
- `deploys.test.myservice:101|c` <- increments by 101
- `deploys.test.myservice:1|c|@0.1` <- with sample rate, increments by 10
- Sets
- `users.unique:101|s`
- `users.unique:101|s`
- `users.unique:102|s` <- would result in a count of 2 for `users.unique`
- Timings & Histograms
- `load.time:320|ms`
- `load.time.nanoseconds:1|h`
- `load.time:200|ms|@0.1` <- sampled 1/10 of the time
- Distributions
- `load.time:320|d`
- `load.time.nanoseconds:1|d`
- `load.time:200|d|@0.1` <- sampled 1/10 of the time
It is possible to omit repetitive names and merge individual stats into a
single line by separating them with additional colons:
- `users.current.den001.myapp:32|g:+10|g:-10|g`
- `deploys.test.myservice:1|c:101|c:1|c|@0.1`
- `users.unique:101|s:101|s:102|s`
- `load.time:320|ms:200|ms|@0.1`
This also allows for mixed types in a single line:
- `foo:1|c:200|ms`
The string `foo:1|c:200|ms` is internally split into two individual metrics
`foo:1|c` and `foo:200|ms` which are added to the aggregator separately.
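For illustration, here is a minimal Go sketch of splitting such lines into a bucket name and individual `value|type[|@rate]` segments, and of scaling a counter by its sample rate. The helper names are invented for this example; this is not the plugin's actual parser:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// splitStatsdLine splits a line such as "foo:1|c:200|ms" into its bucket
// name and the individual "value|type[|@rate]" segments.
func splitStatsdLine(line string) (string, []string, error) {
	parts := strings.SplitN(line, ":", 2)
	if len(parts) != 2 {
		return "", nil, fmt.Errorf("missing ':' in %q", line)
	}
	return parts[0], strings.Split(parts[1], ":"), nil
}

// counterIncrement applies the optional sample rate, so "1|c|@0.1" counts as 10.
func counterIncrement(segment string) (float64, error) {
	fields := strings.Split(segment, "|")
	if len(fields) < 2 || fields[1] != "c" {
		return 0, fmt.Errorf("not a counter: %q", segment)
	}
	value, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0, err
	}
	if len(fields) == 3 && strings.HasPrefix(fields[2], "@") {
		rate, err := strconv.ParseFloat(fields[2][1:], 64)
		if err != nil || rate <= 0 {
			return 0, fmt.Errorf("bad sample rate in %q", segment)
		}
		value /= rate
	}
	return value, nil
}

func main() {
	bucket, segments, _ := splitStatsdLine("deploys.test.myservice:1|c:101|c:1|c|@0.1")
	fmt.Println(bucket, segments) // deploys.test.myservice [1|c 101|c 1|c|@0.1]

	inc, _ := counterIncrement("1|c|@0.1")
	fmt.Println(inc) // 10
}
```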
## Influx Statsd
In order to take advantage of InfluxDB's tagging system, we have made a couple
of additions to the standard statsd protocol. First, you can specify
tags in a manner similar to the line-protocol, like this:
```shell
users.current,service=payroll,region=us-west:32|g
```
<!-- TODO Second, you can specify multiple fields within a measurement:
```
current.users,service=payroll,server=host01:west=10,east=10,central=2,south=10|g
```
-->
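As a rough sketch (again, not the plugin's parser), the tag extension can be handled by splitting the bucket name on commas before the usual `name:value|type` parsing, with `=` separating tag keys from values:

```go
package main

import (
	"fmt"
	"strings"
)

// splitNameAndTags separates the measurement name from comma-separated
// key=value tags, e.g. "users.current,service=payroll,region=us-west".
func splitNameAndTags(bucket string) (string, map[string]string) {
	parts := strings.Split(bucket, ",")
	tags := make(map[string]string, len(parts)-1)
	for _, p := range parts[1:] {
		if kv := strings.SplitN(p, "=", 2); len(kv) == 2 {
			tags[kv[0]] = kv[1]
		}
	}
	return parts[0], tags
}

func main() {
	line := "users.current,service=payroll,region=us-west:32|g"
	bucketAndValue := strings.SplitN(line, ":", 2)
	name, tags := splitNameAndTags(bucketAndValue[0])
	fmt.Println(name, tags, bucketAndValue[1])
	// users.current map[region:us-west service:payroll] 32|g
}
```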
## Metrics
Meta:
- tags: `metric_type=<gauge|set|counter|timing|histogram>`
The measurements produced depend entirely on what the user sends, but here is
a brief rundown of what you can expect from each metric type:
- Gauges
- Gauges are a constant data type. They are not subject to averaging, and they
don't change unless you change them. That is, once you set a gauge value, it
will be a flat line on the graph until you change it again.
- Counters
- Counters are the most basic type. They are treated as a count of a type of
event. They will continually increase unless you set `delete_counters=true`.
- Sets
- Sets count the number of unique values passed to a key. For example, you
could count the number of users accessing your system using `users:<user_id>|s`.
No matter how many times the same user_id is sent, the count will only increase
by 1.
- Timings & Histograms
- Timers are meant to track how long something took. They are an invaluable
tool for tracking application performance.
- The following aggregate measurements are made for timers:
- `statsd_<name>_lower`: The lower bound is the lowest value statsd saw
for that stat during that interval.
- `statsd_<name>_upper`: The upper bound is the highest value statsd saw
for that stat during that interval.
- `statsd_<name>_mean`: The mean is the average of all values statsd saw
for that stat during that interval.
- `statsd_<name>_median`: The median is the middle of all values statsd saw
for that stat during that interval.
- `statsd_<name>_stddev`: The stddev is the sample standard deviation
of all values statsd saw for that stat during that interval.
- `statsd_<name>_sum`: The sum is the sample sum of all values statsd saw
for that stat during that interval.
- `statsd_<name>_count`: The count is the number of timings statsd saw
for that stat during that interval. It is not averaged.
- `statsd_<name>_percentile_<P>`: The `Pth` percentile is a value x such
that `P%` of all the values statsd saw for that stat during that time
period are below x. The most common value people use for `P` is `90`;
it is a great number to try to optimize. A short worked sketch follows
this list.
- Distributions
- The Distribution metric represents the global statistical distribution of a set of values calculated across your entire distributed infrastructure in one time interval. A Distribution can be used to instrument logical objects, like services, independently from the underlying hosts.
- Unlike the Histogram metric type, which aggregates on the Agent during a given time interval, a Distribution metric sends all the raw data during a time interval.
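For a concrete sense of the timer aggregates listed above, the following standalone sketch (illustrative only; the plugin itself keeps a bounded random sample in `runningStats`) derives lower, upper, mean, count, and an index-style percentile from a handful of timing values:

```go
package main

import (
	"fmt"
	"sort"
)

func main() {
	// Timings received for one stat during a single interval (milliseconds).
	timings := []float64{320, 120, 200, 280, 150, 90, 410, 230, 175, 260}

	sort.Float64s(timings)
	count := len(timings)

	sum := 0.0
	for _, v := range timings {
		sum += v
	}
	mean := sum / float64(count)

	// Index-style percentile: the sample at position len*P/100, clamped to
	// the last element, mirroring how the plugin indexes its sorted sample.
	percentile := func(p float64) float64 {
		i := int(float64(count) * p / 100)
		if i > count-1 {
			i = count - 1
		}
		return timings[i]
	}

	fmt.Println("lower:", timings[0])       // 90
	fmt.Println("upper:", timings[count-1]) // 410
	fmt.Println("mean:", mean)              // 223.5
	fmt.Println("count:", count)            // 10
	fmt.Println("p90:", percentile(90))     // 410
	fmt.Println("p50:", percentile(50))     // 230
}
```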
## Plugin arguments
- **protocol** string: Protocol used in listener - tcp or udp options
- **max_tcp_connections** integer: Maximum number of concurrent TCP connections
to allow. Used when protocol is set to tcp.
- **tcp_keep_alive** boolean: Enable TCP keep alive probes
- **tcp_keep_alive_period** duration: Specifies the keep-alive period for an active network connection
- **service_address** string: Address to listen for statsd UDP packets on
- **delete_gauges** boolean: Delete gauges on every collection interval
- **delete_counters** boolean: Delete counters on every collection interval
- **delete_sets** boolean: Delete set counters on every collection interval
- **delete_timings** boolean: Delete timings on every collection interval
- **percentiles** []float: Percentiles to calculate for timing & histogram stats
- **allowed_pending_messages** integer: Number of messages allowed to queue up
waiting to be processed. When this fills, messages will be dropped and logged.
- **percentile_limit** integer: Number of timing/histogram values to track
per-measurement in the calculation of percentiles. Raising this limit increases
the accuracy of percentiles but also increases the memory usage and cpu time.
- **templates** []string: Templates for transforming statsd buckets into influx
measurements and tags.
- **parse_data_dog_tags** boolean: Enable parsing of tags in DataDog's dogstatsd format (<http://docs.datadoghq.com/guides/dogstatsd/>)
- **datadog_extensions** boolean: Enable parsing of DataDog's extensions to dogstatsd format (<http://docs.datadoghq.com/guides/dogstatsd/>)
- **datadog_distributions** boolean: Enable parsing of the Distribution metric in DataDog's dogstatsd format (<https://docs.datadoghq.com/developers/metrics/types/?tab=distribution#definition>)
- **datadog_keep_container_tag** boolean: Keep or drop the container id as tag. Included as optional field in DogStatsD protocol v1.2 if source is running in Kubernetes.
- **max_ttl** config.Duration: Max duration (TTL) for each metric to stay cached/reported without being updated.
## Statsd bucket -> InfluxDB line-protocol Templates
The plugin supports specifying templates for transforming statsd buckets into
InfluxDB measurement names and tags. The templates have a _measurement_ keyword,
which can be used to specify parts of the bucket that are to be used in the
measurement name. Other words in the template are used as tag names. For
example, the following template:
```toml
templates = [
"measurement.measurement.region"
]
```
would result in the following transformation:
```shell
cpu.load.us-west:100|g
=> cpu_load,region=us-west 100
```
Users can also filter the template to use based on the name of the bucket,
using glob matching, like so:
```toml
templates = [
"cpu.* measurement.measurement.region",
"mem.* measurement.measurement.host"
]
```
which would result in the following transformation:
```shell
cpu.load.us-west:100|g
=> cpu_load,region=us-west 100
mem.cached.localhost:256|g
=> mem_cached,host=localhost 256
```
Consult the [Template Patterns](/docs/TEMPLATE_PATTERN.md) documentation for
additional details.
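To make the mechanism concrete, here is a simplified standalone sketch of glob-filtered template application. It only handles the `measurement` keyword and plain tag names; the real engine documented in [Template Patterns](/docs/TEMPLATE_PATTERN.md) supports more, such as `measurement*`, field names, and default tags:

```go
package main

import (
	"fmt"
	"path"
	"strings"
)

// applyTemplate maps bucket parts onto a dot-separated template: parts named
// "measurement" are joined into the measurement name, everything else becomes
// a tag named after the template part.
func applyTemplate(template, bucket string) (string, map[string]string) {
	tmplParts := strings.Split(template, ".")
	bucketParts := strings.Split(bucket, ".")
	tags := make(map[string]string)
	var name []string
	for i, p := range tmplParts {
		if i >= len(bucketParts) {
			break
		}
		if p == "measurement" {
			name = append(name, bucketParts[i])
		} else {
			tags[p] = bucketParts[i]
		}
	}
	return strings.Join(name, "_"), tags
}

func main() {
	templates := map[string]string{ // filter -> template
		"cpu.*": "measurement.measurement.region",
		"mem.*": "measurement.measurement.host",
	}
	for _, bucket := range []string{"cpu.load.us-west", "mem.cached.localhost"} {
		for filter, tmpl := range templates {
			if ok, _ := path.Match(filter, bucket); ok {
				m, tags := applyTemplate(tmpl, bucket)
				fmt.Println(bucket, "=>", m, tags)
			}
		}
	}
	// Output:
	// cpu.load.us-west => cpu_load map[region:us-west]
	// mem.cached.localhost => mem_cached map[host:localhost]
}
```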
## Example Output


@ -0,0 +1,179 @@
package statsd
// this is adapted from datadog's apache licensed version at
// https://github.com/DataDog/datadog-agent/blob/fcfc74f106ab1bd6991dfc6a7061c558d934158a/pkg/dogstatsd/parser.go#L173
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
)
const (
priorityNormal = "normal"
priorityLow = "low"
eventInfo = "info"
eventWarning = "warning"
eventError = "error"
eventSuccess = "success"
)
var uncommenter = strings.NewReplacer("\\n", "\n")
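// parseEventMessage parses a DogStatsD event datagram of the form
// "_e{title.length,text.length}:title|text|<metadata>" and adds the result
// to the accumulator as a metric named after the event title.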
func (s *Statsd) parseEventMessage(now time.Time, message, defaultHostname string) error {
// _e{title.length,text.length}:title|text
// [
// |d:date_happened
// |p:priority
// |h:hostname
// |t:alert_type
// |s:source_type_name
// |#tag1,tag2
// ]
//
//
// tag is key:value
messageRaw := strings.SplitN(message, ":", 2)
if len(messageRaw) < 2 || len(messageRaw[0]) < 7 || len(messageRaw[1]) < 3 {
return errors.New("invalid message format")
}
header := messageRaw[0]
message = messageRaw[1]
rawLen := strings.SplitN(header[3:], ",", 2)
if len(rawLen) != 2 {
return errors.New("invalid message format")
}
titleLen, err := strconv.ParseInt(rawLen[0], 10, 64)
if err != nil {
return fmt.Errorf("invalid message format, could not parse title.length: %q", rawLen[0])
}
if len(rawLen[1]) < 1 {
return fmt.Errorf("invalid message format, could not parse text.length: %q", rawLen[0])
}
textLen, err := strconv.ParseInt(rawLen[1][:len(rawLen[1])-1], 10, 64)
if err != nil {
return fmt.Errorf("invalid message format, could not parse text.length: %q", rawLen[0])
}
if titleLen+textLen+1 > int64(len(message)) {
return errors.New("invalid message format, title.length and text.length exceed total message length")
}
rawTitle := message[:titleLen]
rawText := message[titleLen+1 : titleLen+1+textLen]
message = message[titleLen+1+textLen:]
if len(rawTitle) == 0 || len(rawText) == 0 {
return errors.New("invalid event message format: empty 'title' or 'text' field")
}
name := rawTitle
tags := make(map[string]string, strings.Count(message, ",")+2) // allocate for the approximate number of tags
fields := make(map[string]interface{}, 9)
fields["alert_type"] = eventInfo // default event type
fields["text"] = uncommenter.Replace(rawText)
if defaultHostname != "" {
tags["source"] = defaultHostname
}
fields["priority"] = priorityNormal
ts := now
if len(message) < 2 {
s.acc.AddFields(name, fields, tags, ts)
return nil
}
rawMetadataFields := strings.Split(message[1:], "|")
for i := range rawMetadataFields {
if len(rawMetadataFields[i]) < 2 {
return errors.New("too short metadata field")
}
switch rawMetadataFields[i][:2] {
case "d:":
ts, err := strconv.ParseInt(rawMetadataFields[i][2:], 10, 64)
if err != nil {
continue
}
fields["ts"] = ts
case "p:":
switch rawMetadataFields[i][2:] {
case priorityLow:
fields["priority"] = priorityLow
case priorityNormal: // we already used this as a default
default:
continue
}
case "h:":
tags["source"] = rawMetadataFields[i][2:]
case "t:":
switch rawMetadataFields[i][2:] {
case eventError, eventWarning, eventSuccess, eventInfo:
fields["alert_type"] = rawMetadataFields[i][2:] // already set for info
default:
continue
}
case "k:":
tags["aggregation_key"] = rawMetadataFields[i][2:]
case "s:":
fields["source_type_name"] = rawMetadataFields[i][2:]
default:
if rawMetadataFields[i][0] != '#' {
return fmt.Errorf("unknown metadata type: %q", rawMetadataFields[i])
}
parseDataDogTags(tags, rawMetadataFields[i][1:])
}
}
// Use source tag because host is reserved tag key in Telegraf.
// In datadog the host tag and `h:` are interchangeable, so we have to check for the host tag.
if host, ok := tags["host"]; ok {
delete(tags, "host")
tags["source"] = host
}
s.acc.AddFields(name, fields, tags, ts)
return nil
}
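// parseDataDogTags parses a comma-separated list of datadog tags (key:value
// pairs or bare keys) into the tags map; bare keys are stored with the value
// "true" because influx does not support empty tag values.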
func parseDataDogTags(tags map[string]string, message string) {
if len(message) == 0 {
return
}
start, i := 0, 0
var k string
var inVal bool // check if we are parsing the value part of the tag
for i = range message {
if message[i] == ',' {
if k == "" {
k = message[start:i]
tags[k] = "true" // this is because influx doesn't support empty tags
start = i + 1
continue
}
v := message[start:i]
if v == "" {
v = "true"
}
tags[k] = v
start = i + 1
k, inVal = "", false // reset state vars
} else if message[i] == ':' && !inVal {
k = message[start:i]
start = i + 1
inVal = true
}
}
if k == "" && start < i+1 {
tags[message[start:i+1]] = "true"
}
// grab the last value
if k != "" {
if start < i+1 {
tags[k] = message[start : i+1]
return
}
tags[k] = "true"
}
}


@ -0,0 +1,485 @@
package statsd
import (
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/influxdata/telegraf/testutil"
)
func TestEventGather(t *testing.T) {
now := time.Now()
type expected struct {
title string
tags map[string]string
fields map[string]interface{}
}
tests := []struct {
name string
message string
hostname string
now time.Time
err bool
expected expected
}{{
name: "basic",
message: "_e{10,9}:test title|test text",
hostname: "default-hostname",
now: now,
err: false,
expected: expected{
title: "test title",
tags: map[string]string{"source": "default-hostname"},
fields: map[string]interface{}{
"priority": priorityNormal,
"alert_type": "info",
"text": "test text",
},
},
},
{
name: "escape some stuff",
message: "_e{10,24}:test title|test\\line1\\nline2\\nline3",
hostname: "default-hostname",
now: now.Add(1),
err: false,
expected: expected{
title: "test title",
tags: map[string]string{"source": "default-hostname"},
fields: map[string]interface{}{
"priority": priorityNormal,
"alert_type": "info",
"text": "test\\line1\nline2\nline3",
},
},
},
{
name: "custom time",
message: "_e{10,9}:test title|test text|d:21",
hostname: "default-hostname",
now: now.Add(2),
err: false,
expected: expected{
title: "test title",
tags: map[string]string{"source": "default-hostname"},
fields: map[string]interface{}{
"priority": priorityNormal,
"alert_type": "info",
"text": "test text",
"ts": int64(21),
},
},
},
}
acc := &testutil.Accumulator{}
s := newTestStatsd()
require.NoError(t, s.Start(acc))
defer s.Stop()
for i := range tests {
t.Run(tests[i].name, func(t *testing.T) {
err := s.parseEventMessage(tests[i].now, tests[i].message, tests[i].hostname)
if tests[i].err {
require.Error(t, err)
} else {
require.NoError(t, err)
}
require.Equal(t, uint64(i+1), acc.NMetrics())
require.Equal(t, tests[i].expected.title, acc.Metrics[i].Measurement)
require.Equal(t, tests[i].expected.tags, acc.Metrics[i].Tags)
require.Equal(t, tests[i].expected.fields, acc.Metrics[i].Fields)
})
}
}
// These tests adapted from tests in
// https://github.com/DataDog/datadog-agent/blob/master/pkg/dogstatsd/parser_test.go
// to ensure compatibility with the datadog-agent parser
func TestEvents(t *testing.T) {
now := time.Now()
type args struct {
now time.Time
message string
hostname string
}
type expected struct {
title string
text interface{}
now time.Time
ts interface{}
priority string
source string
alertType interface{}
aggregationKey string
sourceTypeName interface{}
checkTags map[string]string
}
tests := []struct {
name string
args args
expected expected
}{
{
name: "event minimal",
args: args{
now: now,
message: "_e{10,9}:test title|test text",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now,
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
aggregationKey: "",
},
},
{
name: "event multilines text",
args: args{
now: now.Add(1),
message: "_e{10,24}:test title|test\\line1\\nline2\\nline3",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test\\line1\nline2\nline3",
now: now.Add(1),
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
aggregationKey: "",
},
},
{
name: "event pipe in title",
args: args{
now: now.Add(2),
message: "_e{10,24}:test|title|test\\line1\\nline2\\nline3",
hostname: "default-hostname",
},
expected: expected{
title: "test|title",
text: "test\\line1\nline2\nline3",
now: now.Add(2),
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
aggregationKey: "",
},
},
{
name: "event metadata timestamp",
args: args{
now: now.Add(3),
message: "_e{10,9}:test title|test text|d:21",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(3),
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
aggregationKey: "",
ts: int64(21),
},
},
{
name: "event metadata priority",
args: args{
now: now.Add(4),
message: "_e{10,9}:test title|test text|p:low",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(4),
priority: priorityLow,
source: "default-hostname",
alertType: eventInfo,
},
},
{
name: "event metadata hostname",
args: args{
now: now.Add(5),
message: "_e{10,9}:test title|test text|h:localhost",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(5),
priority: priorityNormal,
source: "localhost",
alertType: eventInfo,
},
},
{
name: "event metadata hostname in tag",
args: args{
now: now.Add(6),
message: "_e{10,9}:test title|test text|#host:localhost",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(6),
priority: priorityNormal,
source: "localhost",
alertType: eventInfo,
},
},
{
name: "event metadata empty host tag",
args: args{
now: now.Add(7),
message: "_e{10,9}:test title|test text|#host:,other:tag",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(7),
priority: priorityNormal,
source: "true",
alertType: eventInfo,
checkTags: map[string]string{"other": "tag", "source": "true"},
},
},
{
name: "event metadata alert type",
args: args{
now: now.Add(8),
message: "_e{10,9}:test title|test text|t:warning",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(8),
priority: priorityNormal,
source: "default-hostname",
alertType: eventWarning,
},
},
{
name: "event metadata aggregation key",
args: args{
now: now.Add(9),
message: "_e{10,9}:test title|test text|k:some aggregation key",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(9),
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
aggregationKey: "some aggregation key",
},
},
{
name: "event metadata aggregation key",
args: args{
now: now.Add(10),
message: "_e{10,9}:test title|test text|k:some aggregation key",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(10),
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
aggregationKey: "some aggregation key",
},
},
{
name: "event metadata source type",
args: args{
now: now.Add(11),
message: "_e{10,9}:test title|test text|s:this is the source",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(11),
priority: priorityNormal,
source: "default-hostname",
sourceTypeName: "this is the source",
alertType: eventInfo,
},
},
{
name: "event metadata source type",
args: args{
now: now.Add(11),
message: "_e{10,9}:test title|test text|s:this is the source",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(11),
priority: priorityNormal,
source: "default-hostname",
sourceTypeName: "this is the source",
alertType: eventInfo,
},
},
{
name: "event metadata source tags",
args: args{
now: now.Add(11),
message: "_e{10,9}:test title|test text|#tag1,tag2:test",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(11),
priority: priorityNormal,
source: "default-hostname",
alertType: eventInfo,
checkTags: map[string]string{"tag1": "true", "tag2": "test", "source": "default-hostname"},
},
},
{
name: "event metadata multiple",
args: args{
now: now.Add(11),
message: "_e{10,9}:test title|test text|t:warning|d:12345|p:low|h:some.host|k:aggKey|s:source test|#tag1,tag2:test",
hostname: "default-hostname",
},
expected: expected{
title: "test title",
text: "test text",
now: now.Add(11),
priority: priorityLow,
source: "some.host",
ts: int64(12345),
alertType: eventWarning,
aggregationKey: "aggKey",
sourceTypeName: "source test",
checkTags: map[string]string{"aggregation_key": "aggKey", "tag1": "true", "tag2": "test", "source": "some.host"},
},
},
}
s := newTestStatsd()
acc := &testutil.Accumulator{}
require.NoError(t, s.Start(acc))
defer s.Stop()
for i := range tests {
t.Run(tests[i].name, func(t *testing.T) {
acc.ClearMetrics()
err := s.parseEventMessage(tests[i].args.now, tests[i].args.message, tests[i].args.hostname)
require.NoError(t, err)
m := acc.Metrics[0]
require.Equal(t, tests[i].expected.title, m.Measurement)
require.Equal(t, tests[i].expected.text, m.Fields["text"])
require.Equal(t, tests[i].expected.now, m.Time)
require.Equal(t, tests[i].expected.ts, m.Fields["ts"])
require.Equal(t, tests[i].expected.priority, m.Fields["priority"])
require.Equal(t, tests[i].expected.source, m.Tags["source"])
require.Equal(t, tests[i].expected.alertType, m.Fields["alert_type"])
require.Equal(t, tests[i].expected.aggregationKey, m.Tags["aggregation_key"])
require.Equal(t, tests[i].expected.sourceTypeName, m.Fields["source_type_name"])
if tests[i].expected.checkTags != nil {
require.Equal(t, tests[i].expected.checkTags, m.Tags)
}
})
}
}
func TestEventError(t *testing.T) {
now := time.Now()
s := newTestStatsd()
acc := &testutil.Accumulator{}
require.NoError(t, s.Start(acc))
defer s.Stop()
// missing length header
err := s.parseEventMessage(now, "_e:title|text", "default-hostname")
require.Error(t, err)
// greater length than packet
err = s.parseEventMessage(now, "_e{10,10}:title|text", "default-hostname")
require.Error(t, err)
// zero length
err = s.parseEventMessage(now, "_e{0,0}:a|a", "default-hostname")
require.Error(t, err)
// missing title or text length
err = s.parseEventMessage(now, "_e{5555:title|text", "default-hostname")
require.Error(t, err)
// wrong length format
err = s.parseEventMessage(now, "_e{a,1}:title|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e{1,a}:title|text", "default-hostname")
require.Error(t, err)
// missing title or text length
err = s.parseEventMessage(now, "_e{5,}:title|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e{100,:title|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e,100:title|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e{,4}:title|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e{}:title|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e{,}:title|text", "default-hostname")
require.Error(t, err)
// not enough information
err = s.parseEventMessage(now, "_e|text", "default-hostname")
require.Error(t, err)
err = s.parseEventMessage(now, "_e:|text", "default-hostname")
require.Error(t, err)
// invalid timestamp
err = s.parseEventMessage(now, "_e{5,4}:title|text|d:abc", "default-hostname")
require.NoError(t, err)
// invalid priority
err = s.parseEventMessage(now, "_e{5,4}:title|text|p:urgent", "default-hostname")
require.NoError(t, err)
// invalid priority
err = s.parseEventMessage(now, "_e{5,4}:title|text|p:urgent", "default-hostname")
require.NoError(t, err)
// invalid alert type
err = s.parseEventMessage(now, "_e{5,4}:title|text|t:test", "default-hostname")
require.NoError(t, err)
// unknown metadata
err = s.parseEventMessage(now, "_e{5,4}:title|text|x:1234", "default-hostname")
require.Error(t, err)
}


@ -0,0 +1,150 @@
package statsd
import (
"math"
"math/rand"
"sort"
)
const defaultPercentileLimit = 1000
const defaultMedianLimit = 1000
// runningStats calculates a running mean, variance, standard deviation,
// lower bound, upper bound, count, and can calculate estimated percentiles.
// It is based on the incremental algorithm described here:
//
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
type runningStats struct {
k float64
n int64
ex float64
ex2 float64
// Array used to calculate estimated percentiles
// We will store a maximum of percLimit values, at which point we will start
// randomly replacing old values, hence it is an estimated percentile.
perc []float64
percLimit int
totalSum float64
lowerBound float64
upperBound float64
// cache if we have sorted the list so that we never re-sort a sorted list,
// which can have very bad performance.
sortedPerc bool
// Array used to calculate estimated median values
// We will store a maximum of medLimit values, at which point we will start
// slicing old values
med []float64
medLimit int
medInsertIndex int
}
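// addValue feeds a single observation into the running mean/variance state
// and into the bounded samples used for percentile and median estimation.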
func (rs *runningStats) addValue(v float64) {
// Whenever a value is added, the list is no longer sorted.
rs.sortedPerc = false
if rs.n == 0 {
rs.k = v
rs.upperBound = v
rs.lowerBound = v
if rs.percLimit == 0 {
rs.percLimit = defaultPercentileLimit
}
if rs.medLimit == 0 {
rs.medLimit = defaultMedianLimit
rs.medInsertIndex = 0
}
rs.perc = make([]float64, 0, rs.percLimit)
rs.med = make([]float64, 0, rs.medLimit)
}
// These are used for the running mean and variance
rs.n++
rs.ex += v - rs.k
rs.ex2 += (v - rs.k) * (v - rs.k)
// add to running sum
rs.totalSum += v
// track upper and lower bounds
if v > rs.upperBound {
rs.upperBound = v
} else if v < rs.lowerBound {
rs.lowerBound = v
}
if len(rs.perc) < rs.percLimit {
rs.perc = append(rs.perc, v)
} else {
// Reached limit, choose random index to overwrite in the percentile array
rs.perc[rand.Intn(len(rs.perc))] = v //nolint:gosec // G404: not security critical
}
if len(rs.med) < rs.medLimit {
rs.med = append(rs.med, v)
} else {
// Reached limit, start over
rs.med[rs.medInsertIndex] = v
}
rs.medInsertIndex = (rs.medInsertIndex + 1) % rs.medLimit
}
func (rs *runningStats) mean() float64 {
return rs.k + rs.ex/float64(rs.n)
}
func (rs *runningStats) median() float64 {
// Need to sort for median, but keep temporal order
var values []float64
values = append(values, rs.med...)
sort.Float64s(values)
count := len(values)
if count == 0 {
return 0
} else if count%2 == 0 {
return (values[count/2-1] + values[count/2]) / 2
}
return values[count/2]
}
func (rs *runningStats) variance() float64 {
return (rs.ex2 - (rs.ex*rs.ex)/float64(rs.n)) / float64(rs.n)
}
func (rs *runningStats) stddev() float64 {
return math.Sqrt(rs.variance())
}
func (rs *runningStats) sum() float64 {
return rs.totalSum
}
func (rs *runningStats) upper() float64 {
return rs.upperBound
}
func (rs *runningStats) lower() float64 {
return rs.lowerBound
}
func (rs *runningStats) count() int64 {
return rs.n
}
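// percentile returns an estimated n-th percentile of the tracked sample;
// n is clamped to 100 and the sample is sorted lazily on first use.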
func (rs *runningStats) percentile(n float64) float64 {
if n > 100 {
n = 100
}
if !rs.sortedPerc {
sort.Float64s(rs.perc)
rs.sortedPerc = true
}
i := float64(len(rs.perc)) * n / float64(100)
return rs.perc[max(0, min(int(i), len(rs.perc)-1))]
}


@ -0,0 +1,196 @@
package statsd
import (
"math"
"testing"
)
// Test that a single metric is handled correctly
func TestRunningStats_Single(t *testing.T) {
rs := runningStats{}
values := []float64{10.1}
for _, v := range values {
rs.addValue(v)
}
if rs.mean() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.mean())
}
if rs.median() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.median())
}
if rs.upper() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.upper())
}
if rs.lower() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.lower())
}
if rs.percentile(100) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(100))
}
if rs.percentile(99.95) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(99.95))
}
if rs.percentile(90) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(90))
}
if rs.percentile(50) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(50))
}
if rs.percentile(0) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(0))
}
if rs.count() != 1 {
t.Errorf("Expected %v, got %v", 1, rs.count())
}
if rs.variance() != 0 {
t.Errorf("Expected %v, got %v", 0, rs.variance())
}
if rs.stddev() != 0 {
t.Errorf("Expected %v, got %v", 0, rs.stddev())
}
}
// Test that duplicate values are handled correctly
func TestRunningStats_Duplicate(t *testing.T) {
rs := runningStats{}
values := []float64{10.1, 10.1, 10.1, 10.1}
for _, v := range values {
rs.addValue(v)
}
if rs.mean() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.mean())
}
if rs.median() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.median())
}
if rs.upper() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.upper())
}
if rs.lower() != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.lower())
}
if rs.percentile(100) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(100))
}
if rs.percentile(99.95) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(99.95))
}
if rs.percentile(90) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(90))
}
if rs.percentile(50) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(50))
}
if rs.percentile(0) != 10.1 {
t.Errorf("Expected %v, got %v", 10.1, rs.percentile(0))
}
if rs.count() != 4 {
t.Errorf("Expected %v, got %v", 4, rs.count())
}
if rs.variance() != 0 {
t.Errorf("Expected %v, got %v", 0, rs.variance())
}
if rs.stddev() != 0 {
t.Errorf("Expected %v, got %v", 0, rs.stddev())
}
}
// Test a list of sample values, returns all correct values
func TestRunningStats(t *testing.T) {
rs := runningStats{}
values := []float64{10, 20, 10, 30, 20, 11, 12, 32, 45, 9, 5, 5, 5, 10, 23, 8}
for _, v := range values {
rs.addValue(v)
}
if rs.mean() != 15.9375 {
t.Errorf("Expected %v, got %v", 15.9375, rs.mean())
}
if rs.median() != 10.5 {
t.Errorf("Expected %v, got %v", 10.5, rs.median())
}
if rs.upper() != 45 {
t.Errorf("Expected %v, got %v", 45, rs.upper())
}
if rs.lower() != 5 {
t.Errorf("Expected %v, got %v", 5, rs.lower())
}
if rs.percentile(100) != 45 {
t.Errorf("Expected %v, got %v", 45, rs.percentile(100))
}
if rs.percentile(99.98) != 45 {
t.Errorf("Expected %v, got %v", 45, rs.percentile(99.98))
}
if rs.percentile(90) != 32 {
t.Errorf("Expected %v, got %v", 32, rs.percentile(90))
}
if rs.percentile(50.1) != 11 {
t.Errorf("Expected %v, got %v", 11, rs.percentile(50.1))
}
if rs.percentile(50) != 11 {
t.Errorf("Expected %v, got %v", 11, rs.percentile(50))
}
if rs.percentile(49.9) != 10 {
t.Errorf("Expected %v, got %v", 10, rs.percentile(49.9))
}
if rs.percentile(0) != 5 {
t.Errorf("Expected %v, got %v", 5, rs.percentile(0))
}
if rs.count() != 16 {
t.Errorf("Expected %v, got %v", 4, rs.count())
}
if !fuzzyEqual(rs.variance(), 124.93359, .00001) {
t.Errorf("Expected %v, got %v", 124.93359, rs.variance())
}
if !fuzzyEqual(rs.stddev(), 11.17736, .00001) {
t.Errorf("Expected %v, got %v", 11.17736, rs.stddev())
}
}
// Test that the percentile limit is respected.
func TestRunningStats_PercentileLimit(t *testing.T) {
rs := runningStats{}
rs.percLimit = 10
values := []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
for _, v := range values {
rs.addValue(v)
}
if rs.count() != 11 {
t.Errorf("Expected %v, got %v", 11, rs.count())
}
if len(rs.perc) != 10 {
t.Errorf("Expected %v, got %v", 10, len(rs.perc))
}
}
func fuzzyEqual(a, b, epsilon float64) bool {
return math.Abs(a-b) <= epsilon
}
// Test that the median limit is respected and medInsertIndex is properly incrementing index.
func TestRunningStats_MedianLimitIndex(t *testing.T) {
rs := runningStats{}
rs.medLimit = 10
values := []float64{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
for _, v := range values {
rs.addValue(v)
}
if rs.count() != 11 {
t.Errorf("Expected %v, got %v", 11, rs.count())
}
if len(rs.med) != 10 {
t.Errorf("Expected %v, got %v", 10, len(rs.med))
}
if rs.medInsertIndex != 1 {
t.Errorf("Expected %v, got %v", 0, rs.medInsertIndex)
}
}


@ -0,0 +1,104 @@
# Statsd Server
[[inputs.statsd]]
## Protocol, must be "tcp", "udp4", "udp6" or "udp" (default=udp)
protocol = "udp"
## MaxTCPConnection - applicable when protocol is set to tcp (default=250)
max_tcp_connections = 250
## Enable TCP keep alive probes (default=false)
tcp_keep_alive = false
## Specifies the keep-alive period for an active network connection.
## Only applies to TCP sockets and will be ignored if tcp_keep_alive is false.
## Defaults to the OS configuration.
# tcp_keep_alive_period = "2h"
## Address and port to host UDP listener on
service_address = ":8125"
## The following configuration options control when telegraf clears its cache
## of previous values. If set to false, then telegraf will only clear its
## cache when the daemon is restarted.
## Reset gauges every interval (default=true)
delete_gauges = true
## Reset counters every interval (default=true)
delete_counters = true
## Reset sets every interval (default=true)
delete_sets = true
## Reset timings & histograms every interval (default=true)
delete_timings = true
## Enable aggregation temporality. This adds a temporality=delta or
## temporality=cumulative tag and a start_time field recording when the
## metric accumulation started. You should use this with the OpenTelemetry output.
# enable_aggregation_temporality = false
## Percentiles to calculate for timing & histogram stats.
percentiles = [50.0, 90.0, 99.0, 99.9, 99.95, 100.0]
## separator to use between elements of a statsd metric
metric_separator = "_"
## Parses extensions to statsd in the datadog statsd format
## currently supports metrics and datadog tags.
## http://docs.datadoghq.com/guides/dogstatsd/
datadog_extensions = false
## Parses distributions metric as specified in the datadog statsd format
## https://docs.datadoghq.com/developers/metrics/types/?tab=distribution#definition
datadog_distributions = false
## Keep or drop the container id as tag. Included as optional field
## in DogStatsD protocol v1.2 if source is running in Kubernetes
## https://docs.datadoghq.com/developers/dogstatsd/datagram_shell/?tab=metrics#dogstatsd-protocol-v12
datadog_keep_container_tag = false
## Statsd data translation templates, more info can be read here:
## https://github.com/influxdata/telegraf/blob/master/docs/TEMPLATE_PATTERN.md
# templates = [
# "cpu.* measurement*"
# ]
## Number of UDP messages allowed to queue up, once filled,
## the statsd server will start dropping packets
allowed_pending_messages = 10000
## Number of worker threads used to parse the incoming messages.
# number_workers_threads = 5
## Number of timing/histogram values to track per-measurement in the
## calculation of percentiles. Raising this limit increases the accuracy
## of percentiles but also increases the memory usage and cpu time.
percentile_limit = 1000
## Maximum socket buffer size in bytes, once the buffer fills up, metrics
## will start dropping. Defaults to the OS default.
# read_buffer_size = 65535
## Max duration (TTL) for each metric to stay cached/reported without being updated.
# max_ttl = "10h"
## Sanitize name method
## By default, telegraf will pass names directly as they are received.
## However, upstream statsd now does sanitization of names which can be
## enabled by using the "upstream" method option. This option will a) replace
## white space with '_', replace '/' with '-', and remove characters not
## matching 'a-zA-Z_\-0-9\.;='.
# sanitize_name_method = ""
## Replace dots (.) with underscore (_) and dashes (-) with
## double underscore (__) in metric names.
# convert_names = false
## Convert all numeric counters to float
## Enabling this ensures that both counters and gauges are emitted
## as floats.
# float_counters = false
## Emit timings `metric_<name>_count` field as float, the same as all other
## histogram fields
# float_timings = false
## Emit sets as float
# float_sets = false

File diff suppressed because it is too large

File diff suppressed because it is too large