
Adding upstream version 1.34.4.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-05-24 07:26:29 +02:00
parent e393c3af3f
commit 4978089aab
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
4963 changed files with 677545 additions and 0 deletions


@@ -0,0 +1,51 @@
# Parquet Parser Plugin

The Parquet parser reads Parquet files and converts each row into a metric.

## Configuration

```toml
[[inputs.file]]
  files = ["example"]

  ## Data format to consume.
  ## Each data format has its own unique set of configuration options, read
  ## more about them here:
  ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md
  data_format = "parquet"

  ## Tag columns is an array of columns that should be added as tags.
  # tag_columns = []

  ## Measurement column is the column to use as the measurement name.
  # measurement_column = ""

  ## Timestamp column is the column containing the time that should be used to
  ## create the metric. If not set, the time of parsing is used.
  # timestamp_column = ""

  ## Timestamp format is the time layout that should be used to interpret the
  ## timestamp_column. The time must be `unix`, `unix_ms`, `unix_us`,
  ## `unix_ns`, or a time layout in Go's "reference time" format. To define a
  ## different format, arrange the values from the "reference time" in the
  ## example to match the format you will be using. For more information on
  ## the "reference time", visit https://golang.org/pkg/time/#Time.Format
  ## ex: timestamp_format = "Mon Jan 2 15:04:05 -0700 MST 2006"
  ##     timestamp_format = "2006-01-02T15:04:05Z07:00"
  ##     timestamp_format = "01/02/2006 15:04:05"
  ##     timestamp_format = "unix"
  ##     timestamp_format = "unix_ms"
  # timestamp_format = ""

  ## Timezone allows you to provide an override for timestamps that
  ## do not already include an offset, e.g. 04/06/2016 12:41:45.
  ##
  ## Default: "" which is interpreted as UTC
  ## Options are as follows:
  ##   1. Local              -- interpret based on machine localtime
  ##   2. "America/New_York" -- Unix TZ values like those found in
  ##      https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
  ##   3. UTC                -- or blank/unspecified, will return timestamp in UTC
  # timestamp_timezone = ""
```
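
## Example

As an illustration (the file and column names here are hypothetical), a
Parquet file with columns `tag`, `value`, and `timestamp`, read through
`[[inputs.file]]` with `tag_columns = ["tag"]`,
`timestamp_column = "timestamp"`, and `timestamp_format = "unix_ns"`, would
yield one metric per row, e.g.:

```text
file,tag=row1 value=42i 1710683608143228692
```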


@@ -0,0 +1,126 @@
package parquet

import (
    "reflect"

    "github.com/apache/arrow-go/v18/parquet"
    "github.com/apache/arrow-go/v18/parquet/file"
)

func newColumnParser(reader file.ColumnChunkReader) *columnParser {
    batchSize := 128

    // Allocate a value buffer matching the physical type of the column.
    var valueBuffer interface{}
    switch reader.(type) {
    case *file.BooleanColumnChunkReader:
        valueBuffer = make([]bool, batchSize)
    case *file.Int32ColumnChunkReader:
        valueBuffer = make([]int32, batchSize)
    case *file.Int64ColumnChunkReader:
        valueBuffer = make([]int64, batchSize)
    case *file.Float32ColumnChunkReader:
        valueBuffer = make([]float32, batchSize)
    case *file.Float64ColumnChunkReader:
        valueBuffer = make([]float64, batchSize)
    case *file.ByteArrayColumnChunkReader:
        valueBuffer = make([]parquet.ByteArray, batchSize)
    case *file.FixedLenByteArrayColumnChunkReader:
        valueBuffer = make([]parquet.FixedLenByteArray, batchSize)
    }

    return &columnParser{
        name:        reader.Descriptor().Name(),
        reader:      reader,
        batchSize:   int64(batchSize),
        defLevels:   make([]int16, batchSize),
        repLevels:   make([]int16, batchSize),
        valueBuffer: valueBuffer,
    }
}

type columnParser struct {
    name           string
    reader         file.ColumnChunkReader
    batchSize      int64
    valueOffset    int
    valuesBuffered int
    levelOffset    int64
    levelsBuffered int64
    defLevels      []int16
    repLevels      []int16
    valueBuffer    interface{}
}

// readNextBatch refills the value buffer and the definition/repetition-level
// buffers from the underlying column chunk reader.
func (c *columnParser) readNextBatch() error {
    var err error
    switch reader := c.reader.(type) {
    case *file.BooleanColumnChunkReader:
        values := c.valueBuffer.([]bool)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    case *file.Int32ColumnChunkReader:
        values := c.valueBuffer.([]int32)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    case *file.Int64ColumnChunkReader:
        values := c.valueBuffer.([]int64)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    case *file.Float32ColumnChunkReader:
        values := c.valueBuffer.([]float32)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    case *file.Float64ColumnChunkReader:
        values := c.valueBuffer.([]float64)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    case *file.ByteArrayColumnChunkReader:
        values := c.valueBuffer.([]parquet.ByteArray)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    case *file.FixedLenByteArrayColumnChunkReader:
        values := c.valueBuffer.([]parquet.FixedLenByteArray)
        c.levelsBuffered, c.valuesBuffered, err = reader.ReadBatch(c.batchSize, values, c.defLevels, c.repLevels)
    }
    c.valueOffset = 0
    c.levelOffset = 0
    return err
}

func (c *columnParser) HasNext() bool {
    return c.levelOffset < c.levelsBuffered || c.reader.HasNext()
}

// Next returns the next value in the column. The boolean is false once the
// column is exhausted or a read error occurs; a nil value with true indicates
// a null entry.
func (c *columnParser) Next() (interface{}, bool) {
    if c.levelOffset == c.levelsBuffered {
        if !c.HasNext() {
            return nil, false
        }
        if err := c.readNextBatch(); err != nil {
            return nil, false
        }
        if c.levelsBuffered == 0 {
            return nil, false
        }
    }

    defLevel := c.defLevels[int(c.levelOffset)]
    c.levelOffset++

    // A definition level below the maximum means the entry is null.
    if defLevel < c.reader.Descriptor().MaxDefinitionLevel() {
        return nil, true
    }

    vb := reflect.ValueOf(c.valueBuffer)
    val := vb.Index(c.valueOffset).Interface()
    c.valueOffset++

    // Convert byte arrays to strings
    switch v := val.(type) {
    case parquet.ByteArray:
        val = string(v)
    case parquet.FixedLenByteArray:
        val = string(v)
    }

    return val, true
}
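
For orientation, the sketch below (not part of the plugin; the helper name is hypothetical, and it assumes it sits in this package next to the unexported `newColumnParser`, with `fmt` imported) shows how the iterator is driven: `HasNext`/`Next` walk one column chunk, with a nil value plus `true` marking a null entry and `ok == false` marking exhaustion or a read error.

```go
// dumpColumn prints every value of the first column of the first row group.
// Hypothetical helper for illustration only.
func dumpColumn(reader *file.Reader) error {
    col, err := reader.RowGroup(0).Column(0)
    if err != nil {
        return err
    }

    cp := newColumnParser(col)
    for cp.HasNext() {
        val, ok := cp.Next()
        if !ok {
            break // exhausted, or a read error inside Next
        }
        if val == nil {
            continue // null entry (definition level below the maximum)
        }
        fmt.Printf("%s: %v\n", cp.name, val)
    }
    return nil
}
```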


@@ -0,0 +1,149 @@
package parquet

import (
    "bytes"
    "errors"
    "fmt"
    "slices"
    "time"

    "github.com/apache/arrow-go/v18/parquet/file"

    "github.com/influxdata/telegraf"
    "github.com/influxdata/telegraf/internal"
    "github.com/influxdata/telegraf/metric"
    "github.com/influxdata/telegraf/plugins/parsers"
)

type Parser struct {
    MeasurementColumn string   `toml:"measurement_column"`
    TagColumns        []string `toml:"tag_columns"`
    TimestampColumn   string   `toml:"timestamp_column"`
    TimestampFormat   string   `toml:"timestamp_format"`
    TimestampTimezone string   `toml:"timestamp_timezone"`

    defaultTags map[string]string
    location    *time.Location
    metricName  string
}

func (p *Parser) Init() error {
    if p.TimestampFormat == "" {
        p.TimestampFormat = "unix"
    }

    if p.TimestampTimezone == "" {
        p.location = time.UTC
    } else {
        loc, err := time.LoadLocation(p.TimestampTimezone)
        if err != nil {
            return fmt.Errorf("invalid location %s: %w", p.TimestampTimezone, err)
        }
        p.location = loc
    }

    return nil
}

func (p *Parser) Parse(buf []byte) ([]telegraf.Metric, error) {
    reader := bytes.NewReader(buf)
    parquetReader, err := file.NewParquetReader(reader)
    if err != nil {
        return nil, fmt.Errorf("unable to create parquet reader: %w", err)
    }
    metadata := parquetReader.MetaData()

    now := time.Now()
    metrics := make([]telegraf.Metric, 0, metadata.NumRows)
    for i := 0; i < parquetReader.NumRowGroups(); i++ {
        rowGroup := parquetReader.RowGroup(i)

        // One column parser per column of the row group
        scanners := make([]*columnParser, metadata.Schema.NumColumns())
        for colIndex := range metadata.Schema.NumColumns() {
            col, err := rowGroup.Column(colIndex)
            if err != nil {
                return nil, fmt.Errorf("unable to fetch column %d: %w", colIndex, err)
            }
            scanners[colIndex] = newColumnParser(col)
        }

        rowIndex := 0
        rowGroupMetrics := make([]telegraf.Metric, rowGroup.NumRows())
        for _, s := range scanners {
            for s.HasNext() {
                if rowIndex%int(rowGroup.NumRows()) == 0 {
                    rowIndex = 0
                }

                val, ok := s.Next()
                if !ok || val == nil {
                    rowIndex++
                    continue
                }

                if rowGroupMetrics[rowIndex] == nil {
                    rowGroupMetrics[rowIndex] = metric.New(p.metricName, p.defaultTags, nil, now)
                }

                if p.MeasurementColumn != "" && s.name == p.MeasurementColumn {
                    valStr, err := internal.ToString(val)
                    if err != nil {
                        return nil, fmt.Errorf("could not convert value to string: %w", err)
                    }
                    rowGroupMetrics[rowIndex].SetName(valStr)
                } else if p.TagColumns != nil && slices.Contains(p.TagColumns, s.name) {
                    valStr, err := internal.ToString(val)
                    if err != nil {
                        return nil, fmt.Errorf("could not convert value to string: %w", err)
                    }
                    rowGroupMetrics[rowIndex].AddTag(s.name, valStr)
                } else if p.TimestampColumn != "" && s.name == p.TimestampColumn {
                    valStr, err := internal.ToString(val)
                    if err != nil {
                        return nil, fmt.Errorf("could not convert value to string: %w", err)
                    }
                    timestamp, err := internal.ParseTimestamp(p.TimestampFormat, valStr, p.location)
                    if err != nil {
                        return nil, fmt.Errorf("could not parse %q with format %q: %w", valStr, p.TimestampFormat, err)
                    }
                    rowGroupMetrics[rowIndex].SetTime(timestamp)
                } else {
                    rowGroupMetrics[rowIndex].AddField(s.name, val)
                }

                rowIndex++
            }
        }

        metrics = append(metrics, rowGroupMetrics...)
    }

    return metrics, nil
}

func (p *Parser) ParseLine(line string) (telegraf.Metric, error) {
    metrics, err := p.Parse([]byte(line))
    if err != nil {
        return nil, err
    }

    if len(metrics) < 1 {
        return nil, nil
    }
    if len(metrics) > 1 {
        return nil, errors.New("line contains multiple metrics")
    }

    return metrics[0], nil
}

func (p *Parser) SetDefaultTags(tags map[string]string) {
    p.defaultTags = tags
}

func init() {
    parsers.Add("parquet",
        func(defaultMetricName string) telegraf.Parser {
            return &Parser{metricName: defaultMetricName}
        },
    )
}
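
For reference, the parser can also be exercised outside Telegraf's plugin machinery. A minimal sketch, assuming the package is importable at its usual Telegraf path and that the input file and column names (hypothetical here) exist:

```go
package main

import (
    "fmt"
    "os"

    "github.com/influxdata/telegraf/plugins/parsers/parquet"
)

func main() {
    buf, err := os.ReadFile("input.parquet") // hypothetical input file
    if err != nil {
        panic(err)
    }

    p := &parquet.Parser{
        TagColumns:      []string{"tag"}, // assumed column names
        TimestampColumn: "timestamp",
        TimestampFormat: "unix_ns",
    }
    if err := p.Init(); err != nil {
        panic(err)
    }

    metrics, err := p.Parse(buf)
    if err != nil {
        panic(err)
    }
    for _, m := range metrics {
        fmt.Println(m.Name(), m.Tags(), m.Fields(), m.Time())
    }
}
```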


@@ -0,0 +1,76 @@
package parquet

import (
    "os"
    "path/filepath"
    "testing"

    "github.com/google/go-cmp/cmp"
    "github.com/google/go-cmp/cmp/cmpopts"
    "github.com/stretchr/testify/require"

    "github.com/influxdata/telegraf/config"
    "github.com/influxdata/telegraf/testutil"
    test "github.com/influxdata/telegraf/testutil/plugin_input"
)

func TestCases(t *testing.T) {
    folders, err := os.ReadDir("testcases")
    require.NoError(t, err)
    require.NotEmpty(t, folders)

    for _, f := range folders {
        testcasePath := filepath.Join("testcases", f.Name())
        configFilename := filepath.Join(testcasePath, "telegraf.conf")

        t.Run(f.Name(), func(t *testing.T) {
            // Configure the plugin
            cfg := config.NewConfig()
            require.NoError(t, cfg.LoadConfig(configFilename))
            require.Len(t, cfg.Inputs, 1)

            // Tune the test-plugin
            plugin := cfg.Inputs[0].Input.(*test.Plugin)
            plugin.Path = testcasePath
            require.NoError(t, plugin.Init())

            // Gather the metrics and check for potential errors
            var acc testutil.Accumulator
            err := plugin.Gather(&acc)
            switch len(plugin.ExpectedErrors) {
            case 0:
                require.NoError(t, err)
            case 1:
                require.ErrorContains(t, err, plugin.ExpectedErrors[0])
            default:
                require.Contains(t, plugin.ExpectedErrors, err.Error())
            }

            // Determine checking options
            options := []cmp.Option{
                cmpopts.EquateApprox(0, 1e-6),
                testutil.SortMetrics(),
            }
            if plugin.ShouldIgnoreTimestamp {
                options = append(options, testutil.IgnoreTime())
            }

            // Process expected metrics and compare with resulting metrics
            actual := acc.GetTelegrafMetrics()
            testutil.RequireMetricsEqual(t, plugin.Expected, actual, options...)
        })
    }
}

func BenchmarkParsing(b *testing.B) {
    plugin := &Parser{}

    benchmarkData, err := os.ReadFile("testcases/benchmark/input.parquet")
    require.NoError(b, err)

    b.ResetTimer()
    for n := 0; n < b.N; n++ {
        //nolint:errcheck // Benchmarking so skip the error check to avoid the unnecessary operations
        plugin.Parse(benchmarkData)
    }
}
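
The test is table-driven: each directory under `testcases/` supplies a `telegraf.conf`, an `input.parquet` (generated by the Python scripts below), and the expected metrics. It runs with the standard Go tooling, e.g. `go test` from the plugin directory and `go test -bench BenchmarkParsing` for the benchmark.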


@@ -0,0 +1 @@
test value=42i 1710683608143228692


@@ -0,0 +1,11 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

df = pandas.DataFrame({
    'value': [42],
    'timestamp': ["1710683608143228692"]
})

pyarrow.parquet.write_table(pyarrow.Table.from_pandas(df), "input.parquet")


@@ -0,0 +1,6 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"
  timestamp_column = "timestamp"
  timestamp_format = "unix_ns"


@@ -0,0 +1,2 @@
row1,byteArray=Short,fixedLengthByteArray=STRING boolean=true,float32=1,float64=64,int32=-2147483648i,int64=-9223372036854775808i 1710697199000000000
row2,byteArray=Much\ longer\ string\ here...,fixedLengthByteArray=FOOBAR boolean=false,float32=1.1234568357467651,float64=65.1234567891212,int32=2147483647i,int64=9223372036854775807i 551812924000000000


@@ -0,0 +1,33 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

df = pandas.DataFrame({
    'boolean': [True, False],
    'string': ["row1", "row2"],
    'int32': [-2147483648, 2147483647],
    'int64': [-9223372036854775808, 9223372036854775807],
    'float32': [1.000000001, 1.123456789],
    'float64': [64.00000000000000001, 65.12345678912121212],
    'byteArray': ["Short", "Much longer string here..."],
    'fixedLengthByteArray': ["STRING", "FOOBAR"],
    'timestamp': [
        "Sun, 17 Mar 2024 10:39:59 MST",
        "Sat, 27 Jun 1987 10:22:04 MST",
    ]
})

schema = pyarrow.schema([
    pyarrow.field('boolean', pyarrow.bool_()),
    pyarrow.field('string', pyarrow.string()),
    pyarrow.field('int32', pyarrow.int32()),
    pyarrow.field('int64', pyarrow.int64()),
    pyarrow.field('float32', pyarrow.float32()),
    pyarrow.field('float64', pyarrow.float64()),
    pyarrow.field('byteArray', pyarrow.binary()),
    pyarrow.field('fixedLengthByteArray', pyarrow.binary(6)),
    pyarrow.field('timestamp', pyarrow.binary())
])

pyarrow.parquet.write_table(pyarrow.Table.from_pandas(df, schema), "input.parquet")
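
Note that the `timestamp` column is written as `pyarrow.binary()`, so the parser receives it as a string and parses it with the `Mon, 02 Jan 2006 15:04:05 MST` layout set in this testcase's `telegraf.conf`.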


@@ -0,0 +1,9 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"
  measurement_column = "string"
  tag_columns = ["byteArray", "fixedLengthByteArray"]
  timestamp_column = "timestamp"
  timestamp_format = "Mon, 02 Jan 2006 15:04:05 MST"
  timestamp_timezone = "MST"


@@ -0,0 +1,7 @@
a,tag=row1 float_field=64 1710683695000000000
b,tag=row1 float_field=65 1710683695000000000
c,tag=row1 float_field=66 1710683695000000000
d,tag=row1 float_field=67 1710683695000000000
e,tag=row1 float_field=68 1710683695000000000
f,tag=row1 float_field=69 1710683695000000000
g,tag=row1 float_field=70 1710683695000000000


@@ -0,0 +1,16 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

df = pandas.DataFrame({
    'tag': ["row1", "row1", "row1", "row1", "row1", "row1", "row1"],
    'float_field': [64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0],
    'str_field': ["a", "b", "c", "d", "e", "f", "g"],
    'timestamp': [
        1710683695, 1710683695, 1710683695, 1710683695, 1710683695,
        1710683695, 1710683695,
    ]
})

pyarrow.parquet.write_table(pyarrow.Table.from_pandas(df), "input.parquet")

Binary file not shown.


@@ -0,0 +1,8 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"
  measurement_column = "str_field"
  tag_columns = ["tag"]
  timestamp_column = "timestamp"
  timestamp_format = "unix"


@@ -0,0 +1,6 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

pyarrow.parquet.write_table(pyarrow.Table.from_pandas(pandas.DataFrame()), "input.parquet")

Binary file not shown.


@@ -0,0 +1,3 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"


@@ -0,0 +1,21 @@
test,tag=row1 float_field=64 1710683608143228692
test,tag=row1 float_field=65 1710683608143228692
test,tag=row1 float_field=66 1710683608143228692
test,tag=row1 float_field=67 1710683608143228692
test,tag=row1 float_field=68 1710683608143228692
test,tag=row1 float_field=69 1710683608143228692
test,tag=row1 float_field=70 1710683608143228692
test,tag=row1 float_field=64 1710683608143228693
test,tag=row1 float_field=65 1710683608143228693
test,tag=row1 float_field=66 1710683608143228693
test,tag=row1 float_field=67 1710683608143228693
test,tag=row1 float_field=68 1710683608143228693
test,tag=row1 float_field=69 1710683608143228693
test,tag=row1 float_field=70 1710683608143228693
test,tag=row1 float_field=64 1710683608143228694
test,tag=row1 float_field=65 1710683608143228694
test,tag=row1 float_field=66 1710683608143228694
test,tag=row1 float_field=67 1710683608143228694
test,tag=row1 float_field=68 1710683608143228694
test,tag=row1 float_field=69 1710683608143228694
test,tag=row1 float_field=70 1710683608143228694


@@ -0,0 +1,39 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

df1 = pandas.DataFrame({
    'tag': ["row1", "row1", "row1", "row1", "row1", "row1", "row1"],
    'float_field': [64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0],
    'timestamp': [
        "1710683608143228692", "1710683608143228692", "1710683608143228692",
        "1710683608143228692", "1710683608143228692", "1710683608143228692",
        "1710683608143228692",
    ]
})

df2 = pandas.DataFrame({
    'tag': ["row1", "row1", "row1", "row1", "row1", "row1", "row1"],
    'float_field': [64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0],
    'timestamp': [
        "1710683608143228693", "1710683608143228693", "1710683608143228693",
        "1710683608143228693", "1710683608143228693", "1710683608143228693",
        "1710683608143228693",
    ]
})

df3 = pandas.DataFrame({
    'tag': ["row1", "row1", "row1", "row1", "row1", "row1", "row1"],
    'float_field': [64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0],
    'timestamp': [
        "1710683608143228694", "1710683608143228694", "1710683608143228694",
        "1710683608143228694", "1710683608143228694", "1710683608143228694",
        "1710683608143228694",
    ]
})

with pyarrow.parquet.ParquetWriter('input.parquet', pyarrow.Table.from_pandas(df1).schema) as writer:
    writer.write_table(pyarrow.Table.from_pandas(df1))
    writer.write_table(pyarrow.Table.from_pandas(df2))
    writer.write_table(pyarrow.Table.from_pandas(df3))
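
Writing the three frames through a single `ParquetWriter` produces one file containing three row groups, which exercises the parser's row-group loop (`NumRowGroups`) above.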


@@ -0,0 +1,8 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"
  measurement_column = "str_field"
  tag_columns = ["tag"]
  timestamp_column = "timestamp"
  timestamp_format = "unix_ns"


@@ -0,0 +1,6 @@
test,tag=row1 float_field=64 1709313032000000000
test,tag=row2 float_field=65 1709399432000000000
test,tag=row3 int_field=65 1709485832000000000
test,tag=row4 uint_field=5 1709572232000000000
test,tag=row5 bool_field=true 1709658632000000000
test,str_field=blargh,tag=multi_field bool_field=false 1709831432000000000


@@ -0,0 +1,20 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

df = pandas.DataFrame({
    'tag': ["row1", "row2", "row3", "row4", "row5", "row6", "multi_field"],
    'float_field': [64.0, 65.0, None, None, None, None, None],
    'int_field': [None, None, 65, None, None, None, None],
    'uint_field': [None, None, None, 5, None, None, None],
    'bool_field': [None, None, None, None, True, None, False],
    'str_field': [None, None, None, None, None, "blargh", "blargh"],
    'timestamp': [
        "2024-03-01T17:10:32", "2024-03-02T17:10:32", "2024-03-03T17:10:32",
        "2024-03-04T17:10:32", "2024-03-05T17:10:32", "2024-03-06T17:10:32",
        "2024-03-07T17:10:32",
    ]
})

pyarrow.parquet.write_table(pyarrow.Table.from_pandas(df), "input.parquet")

Binary file not shown.


@@ -0,0 +1,7 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"
  tag_columns = ["tag", "str_field"]
  timestamp_column = "timestamp"
  timestamp_format = "2006-01-02T15:04:05"


@@ -0,0 +1,3 @@
test value=1.1 1710511506000000000
test value=2.2 1710597906000000000
test value=3.3 1710684306000000000


@@ -0,0 +1,14 @@
#!/usr/bin/env python
import pandas
import pyarrow
import pyarrow.parquet

df = pandas.DataFrame({
    'value': [1.1, 2.2, 3.3],
    'timestamp': [
        "2024-03-15T14:05:06+00:00", "2024-03-16T14:05:06+00:00",
        "2024-03-17T14:05:06+00:00",
    ]
})

pyarrow.parquet.write_table(pyarrow.Table.from_pandas(df), "input.parquet")


@@ -0,0 +1,7 @@
[[inputs.test]]
  files = ["input.parquet"]
  data_format = "parquet"
  tag_columns = ["tag", "str_field"]
  timestamp_column = "timestamp"
  timestamp_format = "2006-01-02T15:04:05Z07:00"