
Adding upstream version 1.34.4.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-24 07:26:29 +02:00
parent e393c3af3f
commit 4978089aab
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
4963 changed files with 677545 additions and 0 deletions

@@ -0,0 +1,229 @@
# CSV Parser Plugin
The `csv` parser creates metrics from a document containing comma-separated
values.
## Configuration
```toml
[[inputs.file]]
files = ["example"]
## Data format to consume.
## Each data format has its own unique set of configuration options, read
## more about them here:
## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
data_format = "csv"
## Indicates how many rows to treat as a header. By default, the parser assumes
## there is no header and will parse the first row as data. If set to anything more
## than 1, column names will be concatenated with the names listed in the
## subsequent header rows. If `csv_column_names` is specified, the column
## names from the header will be overridden.
csv_header_row_count = 0
## For assigning custom names to columns. If specified, all columns should
## be named, as unnamed columns will be ignored by the parser.
## This option is required if `csv_header_row_count` is set to 0.
csv_column_names = []
## For assigning explicit data types to columns.
## Supported types: "int", "float", "bool", "string".
## Specify types in order by column (e.g. `["string", "int", "float"]`)
## If this is not specified, the parser will attempt to automatically convert
## each value to one of the types above.
csv_column_types = []
## Indicates the number of rows to skip before looking for metadata and header information.
csv_skip_rows = 0
## Indicates the number of rows to parse as metadata before looking for header information.
## By default, the parser assumes there are no metadata rows to parse.
## If set, the parser will use the separators provided in
## csv_metadata_separators to look for metadata.
## Please note that by default, the (key, value) pairs will be added as tags.
## If fields are required, use the converter processor.
csv_metadata_rows = 0
## A list of metadata separators. If csv_metadata_rows is set,
## csv_metadata_separators must contain at least one separator.
## Please note that separators are case sensitive and the order of the
## separators is respected.
csv_metadata_separators = [":", "="]
## A set of metadata trim characters.
## If csv_metadata_trim_set is not set, no trimming is performed.
## Please note that the trim cutset is case sensitive.
csv_metadata_trim_set = ""
## Indicates the number of columns to skip before looking for data to parse.
## These columns will be skipped in the header as well.
csv_skip_columns = 0
## The separator between csv fields
## By default, the parser assumes a comma (",")
## Please note that if you use an invalid delimiter (e.g. "\u0000"), commas
## will be changed to "\ufffd" and the invalid delimiter to a comma during
## parsing; afterwards the invalid characters and commas are restored to
## their original values.
csv_delimiter = ","
## The character reserved for marking a row as a comment row
## Commented rows are skipped and not parsed
csv_comment = ""
## If set to true, the parser will remove leading whitespace from fields
## By default, this is false
csv_trim_space = false
## Columns listed here will be added as tags. Any other columns
## will be added as fields.
csv_tag_columns = []
## Set to true to let the column tags overwrite the metadata and default tags.
csv_tag_overwrite = false
## The column to extract the name of the metric from. Will not be
## included as a field in the metric.
csv_measurement_column = ""
## The column to extract time information for the metric
## `csv_timestamp_format` must be specified if this is used.
## Will not be included as a field in the metric.
csv_timestamp_column = ""
## The format of the time data extracted from `csv_timestamp_column`.
## This must be specified if `csv_timestamp_column` is specified.
csv_timestamp_format = ""
## The timezone of time data extracted from `csv_timestamp_column`
## in case there is no timezone information.
## It follows the IANA Time Zone database.
csv_timezone = ""
## Indicates values to skip, such as an empty string value "".
## A field will be skipped entirely when its value matches any of the values listed here.
csv_skip_values = []
## If set to true, the parser will skip csv lines that cannot be parsed.
## By default, this is false
csv_skip_errors = false
## Reset the parser on given conditions.
## This option can be used to reset the parser's state e.g. when always reading a
## full CSV structure including header etc. Available modes are
## "none" -- do not reset the parser (default)
## "always" -- reset the parser with each call (ignored in line-wise parsing)
## Helpful when e.g. reading whole files in each gather-cycle.
# csv_reset_mode = "none"
```
### csv_timestamp_column, csv_timestamp_format
By default, the current time will be used for all created metrics. To set the
time from the parsed document instead, use the `csv_timestamp_column` and
`csv_timestamp_format` options together to take the time from a value in the
document.
The `csv_timestamp_column` option specifies the key containing the time value
and `csv_timestamp_format` must be set to `unix`, `unix_ms`, `unix_us`,
`unix_ns`, or a format string using the Go "reference time", which is defined
to be the **specific time**: `Mon Jan 2 15:04:05 MST 2006`.
Consult the Go [time][time parse] package for details and additional examples
on how to set the time format.
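For example, assuming a document whose `time` column holds Unix epoch seconds
(the file name and column name below are illustrative), a minimal
configuration could look like this:

```toml
[[inputs.file]]
files = ["example"]
data_format = "csv"
csv_header_row_count = 1
csv_timestamp_column = "time"
csv_timestamp_format = "unix"
```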
## Metrics
One metric is created for each row with the columns added as fields. The type
of the field is automatically determined based on the contents of the value.
In addition to the options above, you can use [metric filtering][] to skip over
columns and rows.
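If the automatic type detection is not desired, `csv_column_types` can be used
to pin each column to an explicit type instead; a minimal sketch assuming a
two-column document:

```toml
[[inputs.file]]
files = ["example"]
data_format = "csv"
csv_header_row_count = 1
## force the first column to string and the second to int
csv_column_types = ["string", "int"]
```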
## Examples
Config:
```toml
[[inputs.file]]
files = ["example"]
data_format = "csv"
csv_header_row_count = 1
csv_measurement_column = "measurement"
csv_timestamp_column = "time"
csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
```
Input:
```csv
measurement,cpu,time_user,time_system,time_idle,time
cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
```
Output:
```text
cpu cpu=cpu0,time_user=42,time_system=42,time_idle=42 1536869008000000000
```
Config:
```toml
[[inputs.file]]
files = ["example"]
data_format = "csv"
csv_metadata_rows = 2
csv_metadata_separators = [":", "="]
csv_metadata_trim_set = " #"
csv_header_row_count = 1
csv_measurement_column = "measurement"
csv_tag_columns = ["Version","cpu"]
csv_timestamp_column = "time"
csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
```
Input:
```csv
# Version=1.1
# File Created: 2021-11-17T07:02:45+10:00
Version,measurement,cpu,time_user,time_system,time_idle,time
1.2,cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
```
Output:
```text
cpu,cpu=cpu0,File\ Created=2021-11-17T07:02:45+10:00,Version=1.1 time_user=42,time_system=42,time_idle=42 1536869008000000000
```
Config:
```toml
[[inputs.file]]
files = ["example"]
data_format = "csv"
csv_metadata_rows = 2
csv_metadata_separators = [":", "="]
csv_metadata_trim_set = " #"
csv_header_row_count = 1
csv_measurement_column = "measurement"
csv_tag_columns = ["Version","cpu"]
csv_tag_overwrite = true
csv_timestamp_column = "time"
csv_timestamp_format = "2006-01-02T15:04:05Z07:00"
```
Input:
```csv
# Version=1.1
# File Created: 2021-11-17T07:02:45+10:00
Version,measurement,cpu,time_user,time_system,time_idle,time
1.2,cpu,cpu0,42,42,42,2018-09-13T13:03:28Z
```
Output:
```text
cpu,cpu=cpu0,File\ Created=2021-11-17T07:02:45+10:00,Version=1.2 time_user=42,time_system=42,time_idle=42 1536869008000000000
```
[time parse]: https://pkg.go.dev/time#Parse
[metric filtering]: /docs/CONFIGURATION.md#metric-filtering

@@ -0,0 +1,506 @@
package csv
import (
"bufio"
"bytes"
"encoding/csv"
"errors"
"fmt"
"io"
"sort"
"strconv"
"strings"
"time"
_ "time/tzdata" // needed to bundle timezone info into the binary for Windows
"unicode/utf8"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/internal/choice"
"github.com/influxdata/telegraf/metric"
"github.com/influxdata/telegraf/plugins/parsers"
)
type TimeFunc func() time.Time
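// replacementByte and commaByte support delimiters that encoding/csv cannot
// handle: Parse() swaps real commas to the replacement character and the
// invalid delimiter to a comma before the data is read.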
const replacementByte = "\ufffd"
const commaByte = "\u002C"
type Parser struct {
ColumnNames []string `toml:"csv_column_names"`
ColumnTypes []string `toml:"csv_column_types"`
Comment string `toml:"csv_comment"`
Delimiter string `toml:"csv_delimiter"`
HeaderRowCount int `toml:"csv_header_row_count"`
MeasurementColumn string `toml:"csv_measurement_column"`
MetricName string `toml:"metric_name"`
SkipColumns int `toml:"csv_skip_columns"`
SkipRows int `toml:"csv_skip_rows"`
TagColumns []string `toml:"csv_tag_columns"`
TagOverwrite bool `toml:"csv_tag_overwrite"`
TimestampColumn string `toml:"csv_timestamp_column"`
TimestampFormat string `toml:"csv_timestamp_format"`
Timezone string `toml:"csv_timezone"`
TrimSpace bool `toml:"csv_trim_space"`
SkipValues []string `toml:"csv_skip_values"`
SkipErrors bool `toml:"csv_skip_errors"`
MetadataRows int `toml:"csv_metadata_rows"`
MetadataSeparators []string `toml:"csv_metadata_separators"`
MetadataTrimSet string `toml:"csv_metadata_trim_set"`
ResetMode string `toml:"csv_reset_mode"`
Log telegraf.Logger `toml:"-"`
metadataSeparatorList metadataPattern
location *time.Location
gotColumnNames bool
invalidDelimiter bool
TimeFunc func() time.Time
DefaultTags map[string]string
metadataTags map[string]string
gotInitialColumnNames bool
remainingSkipRows int
remainingHeaderRows int
remainingMetadataRows int
}
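// metadataPattern implements sort.Interface so that longer separators are
// tried before shorter ones when splitting metadata rows.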
type metadataPattern []string
func (record metadataPattern) Len() int {
return len(record)
}
func (record metadataPattern) Swap(i, j int) {
record[i], record[j] = record[j], record[i]
}
func (record metadataPattern) Less(i, j int) bool {
// Metadata with longer lengths should be ordered before shorter metadata
return len(record[i]) > len(record[j])
}
func (p *Parser) initializeMetadataSeparators() error {
// initialize metadata
p.metadataTags = make(map[string]string)
if p.MetadataRows <= 0 {
return nil
}
if len(p.MetadataSeparators) == 0 {
return errors.New("csv_metadata_separators required when specifying csv_metadata_rows")
}
p.metadataSeparatorList = make(metadataPattern, 0, len(p.MetadataSeparators))
patternList := make(map[string]bool, len(p.MetadataSeparators))
for _, pattern := range p.MetadataSeparators {
if patternList[pattern] {
// Ignore further, duplicated entries
continue
}
patternList[pattern] = true
p.metadataSeparatorList = append(p.metadataSeparatorList, pattern)
}
sort.Stable(p.metadataSeparatorList)
return nil
}
func (p *Parser) parseMetadataRow(haystack string) map[string]string {
haystack = strings.TrimRight(haystack, "\r\n")
for _, needle := range p.metadataSeparatorList {
metadata := strings.SplitN(haystack, needle, 2)
if len(metadata) < 2 {
continue
}
key := strings.Trim(metadata[0], p.MetadataTrimSet)
if len(key) > 0 {
value := strings.Trim(metadata[1], p.MetadataTrimSet)
return map[string]string{key: value}
}
}
return nil
}
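// Reset restores the parser to its initial state so that a complete CSV
// document, including metadata and header rows, can be parsed again.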
func (p *Parser) Reset() {
// Reset the columns if they were not user-specified
p.gotColumnNames = p.gotInitialColumnNames
if !p.gotInitialColumnNames {
p.ColumnNames = nil
}
// Reset the internal counters
p.remainingSkipRows = p.SkipRows
p.remainingHeaderRows = p.HeaderRowCount
p.remainingMetadataRows = p.MetadataRows
}
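// Init validates the configuration and initializes the parser's state.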
func (p *Parser) Init() error {
if p.HeaderRowCount == 0 && len(p.ColumnNames) == 0 {
return errors.New("`csv_header_row_count` must be defined if `csv_column_names` is not specified")
}
if p.Delimiter != "" {
runeStr := []rune(p.Delimiter)
if len(runeStr) > 1 {
return fmt.Errorf("csv_delimiter must be a single character, got: %s", p.Delimiter)
}
p.invalidDelimiter = !validDelim(runeStr[0])
}
if p.Comment != "" {
runeStr := []rune(p.Comment)
if len(runeStr) > 1 {
return fmt.Errorf("csv_delimiter must be a single character, got: %s", p.Comment)
}
}
p.gotInitialColumnNames = len(p.ColumnNames) > 0
if len(p.ColumnNames) > 0 && len(p.ColumnTypes) > 0 && len(p.ColumnNames) != len(p.ColumnTypes) {
return errors.New("csv_column_names field count doesn't match with csv_column_types")
}
if err := p.initializeMetadataSeparators(); err != nil {
return fmt.Errorf("initializing separators failed: %w", err)
}
if p.TimeFunc == nil {
p.TimeFunc = time.Now
}
if p.Timezone != "" {
loc, err := time.LoadLocation(p.Timezone)
if err != nil {
return fmt.Errorf("invalid timezone: %w", err)
}
p.location = loc
}
if p.ResetMode == "" {
p.ResetMode = "none"
}
if !choice.Contains(p.ResetMode, []string{"none", "always"}) {
return fmt.Errorf("unknown reset mode %q", p.ResetMode)
}
p.Reset()
return nil
}
func (p *Parser) SetTimeFunc(fn TimeFunc) {
p.TimeFunc = fn
}
func (p *Parser) compile(r io.Reader) *csv.Reader {
csvReader := csv.NewReader(r)
// ensures that the reader reads records of different lengths without an error
csvReader.FieldsPerRecord = -1
if !p.invalidDelimiter && p.Delimiter != "" {
csvReader.Comma, _ = utf8.DecodeRuneInString(p.Delimiter)
}
// Check if delimiter is invalid
if p.invalidDelimiter && p.Delimiter != "" {
csvReader.Comma, _ = utf8.DecodeRuneInString(commaByte)
}
if p.Comment != "" {
csvReader.Comment, _ = utf8.DecodeRuneInString(p.Comment)
}
csvReader.TrimLeadingSpace = p.TrimSpace
return csvReader
}
// Taken from upstream Golang code see
// https://github.com/golang/go/blob/release-branch.go1.19/src/encoding/csv/reader.go#L95
func validDelim(r rune) bool {
return r != 0 && r != '"' && r != '\r' && r != '\n' && utf8.ValidRune(r) && r != utf8.RuneError
}
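// Parse converts a buffer of CSV data into metrics, consuming any configured
// skip, metadata and header rows first.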
func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
// Reset the parser according to the specified mode
if p.ResetMode == "always" {
p.Reset()
}
// If using an invalid delimiter, replace commas with replacement and
// invalid delimiter with commas
if p.invalidDelimiter {
buf = bytes.ReplaceAll(buf, []byte(commaByte), []byte(replacementByte))
buf = bytes.ReplaceAll(buf, []byte(p.Delimiter), []byte(commaByte))
}
r := bytes.NewReader(buf)
metrics, err := parseCSV(p, r)
if err != nil && errors.Is(err, io.EOF) {
return nil, parsers.ErrEOF
}
return metrics, err
}
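// ParseLine parses a single line of CSV data and returns at most one metric.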
func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
if len(line) == 0 {
if p.remainingSkipRows > 0 {
p.remainingSkipRows--
return nil, parsers.ErrEOF
}
if p.remainingMetadataRows > 0 {
p.remainingMetadataRows--
return nil, parsers.ErrEOF
}
}
r := bytes.NewReader([]byte(line))
metrics, err := parseCSV(p, r)
if err != nil {
if errors.Is(err, io.EOF) {
return nil, parsers.ErrEOF
}
return nil, err
}
if len(metrics) == 1 {
return metrics[0], nil
}
if len(metrics) > 1 {
return nil, fmt.Errorf("expected 1 metric found %d", len(metrics))
}
return nil, nil
}
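// parseCSV consumes the skip, metadata and header rows, then parses all
// remaining records into metrics.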
func parseCSV(p *Parser, r io.Reader) ([]telegraf.Metric, error) {
lineReader := bufio.NewReader(r)
// skip first rows
for p.remainingSkipRows > 0 {
line, err := lineReader.ReadString('\n')
if err != nil && len(line) == 0 {
return nil, err
}
p.remainingSkipRows--
}
// Parse metadata
for p.remainingMetadataRows > 0 {
line, err := lineReader.ReadString('\n')
if err != nil && len(line) == 0 {
return nil, err
}
p.remainingMetadataRows--
m := p.parseMetadataRow(line)
for k, v := range m {
p.metadataTags[k] = v
}
}
csvReader := p.compile(lineReader)
// If there is a header and we did not get ColumnNames,
// set ColumnNames to the names extracted from the header.
// We always re-read the header to avoid side effects
// in cases where multiple files with different
// headers are read.
for p.remainingHeaderRows > 0 {
header, err := csvReader.Read()
if err != nil {
return nil, err
}
p.remainingHeaderRows--
if p.gotColumnNames {
// Ignore header lines if columns are named
continue
}
// concatenate header names
for i, name := range header {
if p.TrimSpace {
name = strings.Trim(name, " ")
}
if len(p.ColumnNames) <= i {
p.ColumnNames = append(p.ColumnNames, name)
} else {
p.ColumnNames[i] = p.ColumnNames[i] + name
}
}
}
if !p.gotColumnNames {
// align column names with the records by dropping the skipped columns
p.ColumnNames = p.ColumnNames[p.SkipColumns:]
p.gotColumnNames = true
}
table, err := csvReader.ReadAll()
if err != nil {
return nil, err
}
metrics := make([]telegraf.Metric, 0)
for _, record := range table {
m, err := p.parseRecord(record)
if err != nil {
if p.SkipErrors {
p.Log.Debugf("Parsing error: %v", err)
continue
}
return metrics, err
}
metrics = append(metrics, m)
}
return metrics, nil
}
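// parseRecord converts a single CSV record into a metric, applying tag
// assignment, type conversion and timestamp extraction.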
func (p *Parser) parseRecord(record []string) (telegraf.Metric, error) {
recordFields := make(map[string]interface{})
tags := make(map[string]string)
if p.TagOverwrite {
// add default tags
for k, v := range p.DefaultTags {
tags[k] = v
}
// add metadata tags
for k, v := range p.metadataTags {
tags[k] = v
}
}
// skip columns in record
record = record[p.SkipColumns:]
outer:
for i, fieldName := range p.ColumnNames {
if i < len(record) {
value := record[i]
if p.TrimSpace {
value = strings.Trim(value, " ")
}
// don't record fields where the value matches a skip value
for _, s := range p.SkipValues {
if value == s {
continue outer
}
}
for _, tagName := range p.TagColumns {
if tagName == fieldName {
tags[tagName] = value
continue outer
}
}
// If the field name is the timestamp column, then keep field name as is.
if fieldName == p.TimestampColumn {
recordFields[fieldName] = value
continue
}
// Try explicit conversion only when column types is defined.
if len(p.ColumnTypes) > 0 {
// Throw error if current column count exceeds defined types.
if i >= len(p.ColumnTypes) {
return nil, errors.New("column type: column count exceeded")
}
var val interface{}
var err error
switch p.ColumnTypes[i] {
case "int":
val, err = strconv.ParseInt(value, 10, 64)
if err != nil {
return nil, fmt.Errorf("column type: parse int error %w", err)
}
case "float":
val, err = strconv.ParseFloat(value, 64)
if err != nil {
return nil, fmt.Errorf("column type: parse float error %w", err)
}
case "bool":
val, err = strconv.ParseBool(value)
if err != nil {
return nil, fmt.Errorf("column type: parse bool error %w", err)
}
default:
val = value
}
recordFields[fieldName] = val
continue
}
// attempt type conversions
if iValue, err := strconv.ParseInt(value, 10, 64); err == nil {
recordFields[fieldName] = iValue
} else if fValue, err := strconv.ParseFloat(value, 64); err == nil {
recordFields[fieldName] = fValue
} else if bValue, err := strconv.ParseBool(value); err == nil {
recordFields[fieldName] = bValue
} else {
recordFields[fieldName] = value
}
}
}
if !p.TagOverwrite {
// add metadata tags
for k, v := range p.metadataTags {
tags[k] = v
}
// add default tags
for k, v := range p.DefaultTags {
tags[k] = v
}
}
// will default to plugin name
measurementName := p.MetricName
if p.MeasurementColumn != "" {
if recordFields[p.MeasurementColumn] != nil && recordFields[p.MeasurementColumn] != "" {
measurementName = fmt.Sprintf("%v", recordFields[p.MeasurementColumn])
}
}
metricTime, err := parseTimestamp(p.TimeFunc, recordFields, p.TimestampColumn, p.TimestampFormat, p.location)
if err != nil {
return nil, err
}
// Exclude `TimestampColumn` and `MeasurementColumn`
delete(recordFields, p.TimestampColumn)
delete(recordFields, p.MeasurementColumn)
m := metric.New(measurementName, tags, recordFields, metricTime)
return m, nil
}
// parseTimestamp returns a timestamp. If no timestamp column is configured,
// the current time is returned; otherwise the column's value is parsed
// according to the given format.
func parseTimestamp(timeFunc func() time.Time, recordFields map[string]interface{},
timestampColumn, timestampFormat string, timezone *time.Location,
) (time.Time, error) {
if timestampColumn != "" {
if recordFields[timestampColumn] == nil {
return time.Time{}, fmt.Errorf("timestamp column: %v could not be found", timestampColumn)
}
switch timestampFormat {
case "":
return time.Time{}, errors.New("timestamp format must be specified")
default:
metricTime, err := internal.ParseTimestamp(timestampFormat, recordFields[timestampColumn], timezone)
if err != nil {
return time.Time{}, err
}
return metricTime, err
}
}
return timeFunc(), nil
}
// SetDefaultTags sets the default tags added to every parsed metric
func (p *Parser) SetDefaultTags(tags map[string]string) {
p.DefaultTags = tags
}
func init() {
parsers.Add("csv",
func(defaultMetricName string) telegraf.Parser {
return &Parser{MetricName: defaultMetricName}
})
}

File diff suppressed because it is too large