telegraf/plugins/parsers/csv/parser.go

package csv

import (
	"bufio"
	"bytes"
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"sort"
	"strconv"
	"strings"
	"time"
	_ "time/tzdata" // needed to bundle timezone info into the binary for Windows
	"unicode/utf8"

	"github.com/influxdata/telegraf"
	"github.com/influxdata/telegraf/internal"
	"github.com/influxdata/telegraf/internal/choice"
	"github.com/influxdata/telegraf/metric"
	"github.com/influxdata/telegraf/plugins/parsers"
)

// TimeFunc is a function returning a point in time. The parser calls it to
// timestamp metrics when no timestamp column is configured.
type TimeFunc func() time.Time

// Strings used when rewriting input for delimiters that encoding/csv rejects.
const (
	// replacementByte is the Unicode replacement character; literal commas in
	// the input are turned into it before the invalid delimiter is mapped to
	// a comma.
	replacementByte = "\ufffd"
	// commaByte is the comma, i.e. the delimiter the CSV reader falls back to.
	commaByte = "\u002C"
)

// Parser converts CSV input into telegraf metrics. The exported, toml-tagged
// fields are configuration; the unexported fields hold state that is carried
// from one Parse/ParseLine call to the next.
type Parser struct {
	// ColumnNames are the column names. When none are configured they are
	// built from the header rows while parsing.
	ColumnNames        []string        `toml:"csv_column_names"`
	// ColumnTypes optionally gives the type per column ("int", "float",
	// "bool"); any other value keeps the raw string.
	ColumnTypes        []string        `toml:"csv_column_types"`
	// Comment is the single character marking comment lines.
	Comment            string          `toml:"csv_comment"`
	// Delimiter is the single character separating fields.
	Delimiter          string          `toml:"csv_delimiter"`
	// HeaderRowCount is the number of header rows; names found in several
	// header rows are concatenated per column.
	HeaderRowCount     int             `toml:"csv_header_row_count"`
	// MeasurementColumn names the column whose value becomes the metric name.
	MeasurementColumn  string          `toml:"csv_measurement_column"`
	// MetricName is the metric name used when no measurement column applies.
	MetricName         string          `toml:"metric_name"`
	// SkipColumns is the number of leading columns dropped from the header
	// and from every record.
	SkipColumns        int             `toml:"csv_skip_columns"`
	// SkipRows is the number of leading lines discarded before parsing.
	SkipRows           int             `toml:"csv_skip_rows"`
	// TagColumns lists the columns emitted as tags instead of fields.
	TagColumns         []string        `toml:"csv_tag_columns"`
	// TagOverwrite lets tags taken from columns override default and metadata
	// tags; when false the default and metadata tags win.
	TagOverwrite       bool            `toml:"csv_tag_overwrite"`
	// TimestampColumn names the column holding the metric timestamp.
	TimestampColumn    string          `toml:"csv_timestamp_column"`
	// TimestampFormat is the format used to parse the timestamp column; it is
	// required whenever TimestampColumn is set.
	TimestampFormat    string          `toml:"csv_timestamp_format"`
	// Timezone is the location name loaded in Init and used for timestamps.
	Timezone           string          `toml:"csv_timezone"`
	// TrimSpace trims spaces from header names and values.
	TrimSpace          bool            `toml:"csv_trim_space"`
	// SkipValues lists values for which the field or tag is omitted.
	SkipValues         []string        `toml:"csv_skip_values"`
	// SkipErrors makes the parser log and skip records that fail to parse
	// instead of returning the error.
	SkipErrors         bool            `toml:"csv_skip_errors"`
	// MetadataRows is the number of lines (after the skipped rows) parsed as
	// key/value metadata.
	MetadataRows       int             `toml:"csv_metadata_rows"`
	// MetadataSeparators are the strings that may split a metadata line into
	// key and value; required when MetadataRows is positive.
	MetadataSeparators []string        `toml:"csv_metadata_separators"`
	// MetadataTrimSet is the cutset trimmed from metadata keys and values.
	MetadataTrimSet    string          `toml:"csv_metadata_trim_set"`
	// ResetMode is "none" (the default) or "always"; with "always" the parser
	// state is reset at the start of every Parse call.
	ResetMode          string          `toml:"csv_reset_mode"`
	// Log receives debug messages about skipped records.
	Log                telegraf.Logger `toml:"-"`

	// metadataSeparatorList holds the de-duplicated separators, longest first.
	metadataSeparatorList metadataPattern
	// location is the time zone loaded from Timezone, nil when unset.
	location              *time.Location

	// gotColumnNames is true once the column names are final, either because
	// they were configured or because all header rows have been read.
	gotColumnNames bool

	// invalidDelimiter is true when Delimiter is not accepted by validDelim;
	// Parse then rewrites the input so that a comma can be used instead.
	invalidDelimiter bool

	// TimeFunc supplies the timestamp for metrics without a timestamp column;
	// Init defaults it to time.Now.
	TimeFunc     func() time.Time
	// DefaultTags are merged into the tags of every metric.
	DefaultTags  map[string]string
	// metadataTags collects the key/value pairs read from the metadata rows.
	metadataTags map[string]string

	// gotInitialColumnNames records whether ColumnNames was set at Init time.
	gotInitialColumnNames bool
	// The remaining* counters track how many skip, header and metadata rows
	// still have to be consumed; Reset restores them from the configuration.
	remainingSkipRows     int
	remainingHeaderRows   int
	remainingMetadataRows int
}

// metadataPattern is a list of metadata separators that implements
// sort.Interface, ordering longer separators before shorter ones.
type metadataPattern []string

// Len reports the number of separators.
func (mp metadataPattern) Len() int { return len(mp) }

// Swap exchanges the separators at positions i and j.
func (mp metadataPattern) Swap(i, j int) {
	mp[j], mp[i] = mp[i], mp[j]
}

// Less reports whether the separator at i must sort before the one at j,
// which is the case when it is strictly longer.
func (mp metadataPattern) Less(i, j int) bool {
	return len(mp[j]) < len(mp[i])
}

// initializeMetadataSeparators resets the collected metadata tags and, when
// metadata rows are configured, builds the de-duplicated list of separators
// ordered from longest to shortest. It fails if metadata rows are requested
// without any separator.
func (p *Parser) initializeMetadataSeparators() error {
	p.metadataTags = make(map[string]string)

	// Nothing else to prepare when no metadata rows are expected.
	if p.MetadataRows <= 0 {
		return nil
	}
	if len(p.MetadataSeparators) == 0 {
		return errors.New("csv_metadata_separators required when specifying csv_metadata_rows")
	}

	seen := make(map[string]bool, len(p.MetadataSeparators))
	separators := make(metadataPattern, 0, len(p.MetadataSeparators))
	for _, sep := range p.MetadataSeparators {
		// Keep only the first occurrence of each separator.
		if !seen[sep] {
			seen[sep] = true
			separators = append(separators, sep)
		}
	}
	// Stable sort keeps the configured order among separators of equal length.
	sort.Stable(separators)
	p.metadataSeparatorList = separators

	return nil
}

// parseMetadataRow splits a metadata line into a key/value pair using the
// first separator (in list order) that occurs in the line and leaves a
// non-empty key after trimming. It returns a single-entry map, or nil when
// no separator yields a usable key.
func (p *Parser) parseMetadataRow(haystack string) map[string]string {
	row := strings.TrimRight(haystack, "\r\n")
	for _, sep := range p.metadataSeparatorList {
		parts := strings.SplitN(row, sep, 2)
		if len(parts) != 2 {
			// Separator not present in this row.
			continue
		}
		key := strings.Trim(parts[0], p.MetadataTrimSet)
		if key == "" {
			continue
		}
		return map[string]string{key: strings.Trim(parts[1], p.MetadataTrimSet)}
	}
	return nil
}

// Reset restores the parser state to what it was after Init: the row counters
// are reloaded from the configuration and column names that were learned from
// a header (rather than configured) are forgotten.
func (p *Parser) Reset() {
	// Reload the counters of rows still to be consumed.
	p.remainingSkipRows = p.SkipRows
	p.remainingMetadataRows = p.MetadataRows
	p.remainingHeaderRows = p.HeaderRowCount

	// Only user-specified column names survive a reset.
	if !p.gotInitialColumnNames {
		p.ColumnNames = nil
	}
	p.gotColumnNames = p.gotInitialColumnNames
}

// Init validates the configuration and prepares the internal parser state.
//
// It returns an error when neither header rows nor column names are
// configured, when the delimiter or the comment marker is longer than one
// character, when the number of column names and column types differ, when
// metadata rows are requested without separators, when the timezone cannot be
// loaded, or when the reset mode is unknown.
func (p *Parser) Init() error {
	if p.HeaderRowCount == 0 && len(p.ColumnNames) == 0 {
		return errors.New("`csv_header_row_count` must be defined if `csv_column_names` is not specified")
	}

	if p.Delimiter != "" {
		runeStr := []rune(p.Delimiter)
		if len(runeStr) > 1 {
			return fmt.Errorf("csv_delimiter must be a single character, got: %s", p.Delimiter)
		}
		// Delimiters rejected by encoding/csv are handled by rewriting the
		// input before it is parsed, so only remember the fact here.
		p.invalidDelimiter = !validDelim(runeStr[0])
	}

	if p.Comment != "" {
		if utf8.RuneCountInString(p.Comment) > 1 {
			// Name the option that is actually at fault.
			return fmt.Errorf("csv_comment must be a single character, got: %s", p.Comment)
		}
	}

	p.gotInitialColumnNames = len(p.ColumnNames) > 0
	if len(p.ColumnNames) > 0 && len(p.ColumnTypes) > 0 && len(p.ColumnNames) != len(p.ColumnTypes) {
		return errors.New("csv_column_names field count doesn't match with csv_column_types")
	}

	if err := p.initializeMetadataSeparators(); err != nil {
		return fmt.Errorf("initializing separators failed: %w", err)
	}

	if p.TimeFunc == nil {
		p.TimeFunc = time.Now
	}

	if p.Timezone != "" {
		loc, err := time.LoadLocation(p.Timezone)
		if err != nil {
			return fmt.Errorf("invalid timezone: %w", err)
		}
		p.location = loc
	}

	if p.ResetMode == "" {
		p.ResetMode = "none"
	}
	if !choice.Contains(p.ResetMode, []string{"none", "always"}) {
		return fmt.Errorf("unknown reset mode %q", p.ResetMode)
	}
	// Load the row counters and column-name state from the configuration.
	p.Reset()

	return nil
}

// SetTimeFunc replaces the function used to timestamp metrics that have no
// timestamp column.
func (p *Parser) SetTimeFunc(fn TimeFunc) {
	p.TimeFunc = fn
}

// compile builds a csv.Reader over r that is configured from the parser
// settings: variable record lengths are allowed, leading-space trimming
// follows TrimSpace, and the delimiter and comment characters are applied
// when configured.
func (p *Parser) compile(r io.Reader) *csv.Reader {
	reader := csv.NewReader(r)
	// A negative value lets records have differing numbers of fields.
	reader.FieldsPerRecord = -1
	reader.TrimLeadingSpace = p.TrimSpace

	switch {
	case p.Delimiter == "":
		// Keep the reader's default delimiter.
	case p.invalidDelimiter:
		// The configured delimiter is not usable by the reader, so split on
		// commas instead.
		reader.Comma, _ = utf8.DecodeRuneInString(commaByte)
	default:
		reader.Comma, _ = utf8.DecodeRuneInString(p.Delimiter)
	}

	if p.Comment != "" {
		reader.Comment, _ = utf8.DecodeRuneInString(p.Comment)
	}

	return reader
}

// validDelim reports whether r can be used as a field delimiter by
// encoding/csv: it must be a valid rune other than NUL, the double quote,
// carriage return, line feed and the Unicode replacement character.
//
// Taken from upstream Golang code see
// https://github.com/golang/go/blob/release-branch.go1.19/src/encoding/csv/reader.go#L95
func validDelim(r rune) bool {
	switch r {
	case 0, '"', '\r', '\n', utf8.RuneError:
		return false
	}
	return utf8.ValidRune(r)
}

// Parse converts the CSV data in buf into metrics. With reset mode "always"
// the parser state is reset first. An io.EOF from the underlying parsing is
// reported as parsers.ErrEOF; any other error is returned together with the
// metrics parsed so far.
func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
	if p.ResetMode == "always" {
		p.Reset()
	}

	data := buf
	if p.invalidDelimiter {
		// The reader splits on commas for unusable delimiters: first move
		// literal commas out of the way, then map the delimiter to a comma.
		data = bytes.ReplaceAll(data, []byte(commaByte), []byte(replacementByte))
		data = bytes.ReplaceAll(data, []byte(p.Delimiter), []byte(commaByte))
	}

	metrics, err := parseCSV(p, bytes.NewReader(data))
	if errors.Is(err, io.EOF) {
		return nil, parsers.ErrEOF
	}
	return metrics, err
}

// ParseLine parses a single line of CSV input and returns the resulting
// metric.
//
// Lines that are consumed without producing a metric (skipped rows, metadata
// rows, header rows) yield parsers.ErrEOF. A line that produces no metric
// and no error yields (nil, nil); a line producing more than one metric is an
// error.
func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
	if len(line) == 0 {
		// An empty line still counts as one of the rows to be skipped or
		// treated as metadata.
		if p.remainingSkipRows > 0 {
			p.remainingSkipRows--
			return nil, parsers.ErrEOF
		}
		if p.remainingMetadataRows > 0 {
			p.remainingMetadataRows--
			return nil, parsers.ErrEOF
		}
	}

	// Apply the same input rewriting as Parse: for delimiters the CSV reader
	// cannot use, the reader splits on commas, so literal commas are replaced
	// first and the configured delimiter is then mapped to a comma.
	if p.invalidDelimiter {
		line = strings.ReplaceAll(line, commaByte, replacementByte)
		line = strings.ReplaceAll(line, p.Delimiter, commaByte)
	}

	metrics, err := parseCSV(p, strings.NewReader(line))
	if err != nil {
		if errors.Is(err, io.EOF) {
			return nil, parsers.ErrEOF
		}
		return nil, err
	}

	switch len(metrics) {
	case 0:
		return nil, nil
	case 1:
		return metrics[0], nil
	default:
		return nil, fmt.Errorf("expected 1 metric found %d", len(metrics))
	}
}

// parseCSV reads CSV data from r and converts every data record into a
// metric, updating the state kept in p.
//
// Processing happens in order: pending skip rows are discarded, pending
// metadata rows are parsed into metadata tags, pending header rows are read
// (building the column names unless they were configured), and the remaining
// records are turned into metrics. Because the counters live in p, input
// that arrives in pieces continues where the previous call stopped.
//
// Read errors, including io.EOF when the input ends inside the skip, metadata
// or header section, are returned as-is. A record that fails to parse either
// aborts with the metrics parsed so far, or is logged and skipped when
// SkipErrors is set. An error is also returned when SkipColumns does not fit
// the number of header columns.
func parseCSV(p *Parser, r io.Reader) ([]telegraf.Metric, error) {
	lineReader := bufio.NewReader(r)

	// Discard the rows to skip. A final line without newline still counts,
	// so only give up when nothing at all could be read.
	for p.remainingSkipRows > 0 {
		line, err := lineReader.ReadString('\n')
		if err != nil && len(line) == 0 {
			return nil, err
		}
		p.remainingSkipRows--
	}

	// Parse metadata rows into tags.
	for p.remainingMetadataRows > 0 {
		line, err := lineReader.ReadString('\n')
		if err != nil && len(line) == 0 {
			return nil, err
		}
		p.remainingMetadataRows--
		for k, v := range p.parseMetadataRow(line) {
			p.metadataTags[k] = v
		}
	}

	csvReader := p.compile(lineReader)

	// Header rows are always consumed, but only contribute column names when
	// none are known yet.
	for p.remainingHeaderRows > 0 {
		header, err := csvReader.Read()
		if err != nil {
			return nil, err
		}
		p.remainingHeaderRows--
		if p.gotColumnNames {
			// Ignore header lines if columns are named
			continue
		}
		// Names spread over several header rows are concatenated per column.
		for i, name := range header {
			if p.TrimSpace {
				name = strings.Trim(name, " ")
			}
			if len(p.ColumnNames) <= i {
				p.ColumnNames = append(p.ColumnNames, name)
			} else {
				p.ColumnNames[i] += name
			}
		}
	}

	if !p.gotColumnNames {
		// Drop the skipped columns from the header-derived names. Report a
		// header that is too short instead of panicking on the slice bounds.
		if p.SkipColumns < 0 || p.SkipColumns > len(p.ColumnNames) {
			return nil, fmt.Errorf("cannot skip %d columns in a header with %d columns", p.SkipColumns, len(p.ColumnNames))
		}
		p.ColumnNames = p.ColumnNames[p.SkipColumns:]
		p.gotColumnNames = true
	}

	table, err := csvReader.ReadAll()
	if err != nil {
		return nil, err
	}

	metrics := make([]telegraf.Metric, 0, len(table))
	for _, record := range table {
		m, err := p.parseRecord(record)
		if err != nil {
			if p.SkipErrors {
				p.Log.Debugf("Parsing error: %v", err)
				continue
			}
			return metrics, err
		}
		metrics = append(metrics, m)
	}
	return metrics, nil
}

// parseRecord converts one CSV record into a metric.
//
// After dropping the first SkipColumns values, each remaining value is paired
// with the column name at the same position. Values matching one of
// SkipValues are omitted, values of tag columns become tags, and everything
// else becomes a field: converted according to ColumnTypes when those are
// configured, otherwise by trying int, float and bool before falling back to
// the string. The timestamp and measurement columns are used for the metric
// time and name and are removed from the fields.
//
// An error is returned when the record has fewer values than SkipColumns,
// when a value cannot be converted to its configured type, when there are
// more columns than configured types, or when the timestamp cannot be
// determined.
func (p *Parser) parseRecord(record []string) (telegraf.Metric, error) {
	recordFields := make(map[string]interface{})
	tags := make(map[string]string)

	if p.TagOverwrite {
		// Default and metadata tags go in first so that tags taken from the
		// record can overwrite them.
		for k, v := range p.DefaultTags {
			tags[k] = v
		}
		for k, v := range p.metadataTags {
			tags[k] = v
		}
	}

	// Records may have any number of values, so a short one must be reported
	// rather than sliced out of bounds.
	if p.SkipColumns < 0 || p.SkipColumns > len(record) {
		return nil, fmt.Errorf("cannot skip %d columns in a record with %d columns", p.SkipColumns, len(record))
	}
	record = record[p.SkipColumns:]

outer:
	for i, fieldName := range p.ColumnNames {
		if i >= len(record) {
			// No values left for the remaining columns.
			break
		}
		value := record[i]
		if p.TrimSpace {
			value = strings.Trim(value, " ")
		}

		// don't record fields where the value matches a skip value
		for _, s := range p.SkipValues {
			if value == s {
				continue outer
			}
		}

		for _, tagName := range p.TagColumns {
			if tagName == fieldName {
				tags[tagName] = value
				continue outer
			}
		}

		// The timestamp is kept as raw string; it is parsed separately below.
		if fieldName == p.TimestampColumn {
			recordFields[fieldName] = value
			continue
		}

		// Try explicit conversion only when column types is defined.
		if len(p.ColumnTypes) > 0 {
			// Throw error if current column count exceeds defined types.
			if i >= len(p.ColumnTypes) {
				return nil, errors.New("column type: column count exceeded")
			}

			var val interface{}
			var err error

			switch p.ColumnTypes[i] {
			case "int":
				val, err = strconv.ParseInt(value, 10, 64)
				if err != nil {
					return nil, fmt.Errorf("column type: parse int error %w", err)
				}
			case "float":
				val, err = strconv.ParseFloat(value, 64)
				if err != nil {
					return nil, fmt.Errorf("column type: parse float error %w", err)
				}
			case "bool":
				val, err = strconv.ParseBool(value)
				if err != nil {
					return nil, fmt.Errorf("column type: parse bool error %w", err)
				}
			default:
				val = value
			}

			recordFields[fieldName] = val
			continue
		}

		// Without configured types, guess: int, then float, then bool, and
		// finally keep the string.
		if iValue, err := strconv.ParseInt(value, 10, 64); err == nil {
			recordFields[fieldName] = iValue
		} else if fValue, err := strconv.ParseFloat(value, 64); err == nil {
			recordFields[fieldName] = fValue
		} else if bValue, err := strconv.ParseBool(value); err == nil {
			recordFields[fieldName] = bValue
		} else {
			recordFields[fieldName] = value
		}
	}

	if !p.TagOverwrite {
		// Metadata and default tags go in last so that they take precedence
		// over tags taken from the record.
		for k, v := range p.metadataTags {
			tags[k] = v
		}
		for k, v := range p.DefaultTags {
			tags[k] = v
		}
	}

	// will default to plugin name
	measurementName := p.MetricName
	if p.MeasurementColumn != "" {
		if recordFields[p.MeasurementColumn] != nil && recordFields[p.MeasurementColumn] != "" {
			measurementName = fmt.Sprintf("%v", recordFields[p.MeasurementColumn])
		}
	}

	metricTime, err := parseTimestamp(p.TimeFunc, recordFields, p.TimestampColumn, p.TimestampFormat, p.location)
	if err != nil {
		return nil, err
	}

	// Exclude `TimestampColumn` and `MeasurementColumn`
	delete(recordFields, p.TimestampColumn)
	delete(recordFields, p.MeasurementColumn)

	m := metric.New(measurementName, tags, recordFields, metricTime)

	return m, nil
}

// parseTimestamp determines the time of a metric. Without a timestamp column
// it returns the result of timeFunc. Otherwise the value stored under
// timestampColumn in recordFields is parsed with timestampFormat in the given
// timezone; a missing value, an empty format or a parse failure is an error.
func parseTimestamp(timeFunc func() time.Time, recordFields map[string]interface{},
	timestampColumn, timestampFormat string, timezone *time.Location,
) (time.Time, error) {
	// No timestamp column configured: stamp the metric via timeFunc.
	if timestampColumn == "" {
		return timeFunc(), nil
	}

	raw := recordFields[timestampColumn]
	if raw == nil {
		return time.Time{}, fmt.Errorf("timestamp column: %v could not be found", timestampColumn)
	}
	if timestampFormat == "" {
		return time.Time{}, errors.New("timestamp format must be specified")
	}

	parsed, err := internal.ParseTimestamp(timestampFormat, raw, timezone)
	if err != nil {
		return time.Time{}, err
	}
	return parsed, nil
}

// SetDefaultTags sets the tags that are merged into every parsed metric. The
// map is stored as-is, not copied.
func (p *Parser) SetDefaultTags(tags map[string]string) {
	p.DefaultTags = tags
}

// init registers the parser under the name "csv"; the registered creator
// returns a Parser whose MetricName is the supplied default metric name.
func init() {
	parsers.Add("csv",
		func(defaultMetricName string) telegraf.Parser {
			return &Parser{MetricName: defaultMetricName}
		})
}