Adding upstream version 1.34.4.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
e393c3af3f
commit
4978089aab
4963 changed files with 677545 additions and 0 deletions
77
plugins/inputs/ras/README.md
Normal file
77
plugins/inputs/ras/README.md
Normal file
|
@ -0,0 +1,77 @@
|
|||
# RAS Daemon Input Plugin
|
||||
|
||||
This plugin is only available on Linux (only for `386`, `amd64`, `arm` and
|
||||
`arm64` architectures).
|
||||
|
||||
The `RAS` plugin gathers and counts errors provided by
|
||||
[RASDaemon](https://github.com/mchehab/rasdaemon).
|
||||
|
||||
## Global configuration options <!-- @/docs/includes/plugin_config.md -->
|
||||
|
||||
In addition to the plugin-specific configuration settings, plugins support
|
||||
additional global and plugin configuration settings. These settings are used to
|
||||
modify metrics, tags, and fields, or create aliases and configure ordering, etc.
|
||||
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.
|
||||
|
||||
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins
|
||||
|
||||
## Configuration
|
||||
|
||||
```toml @sample.conf
|
||||
# RAS plugin exposes counter metrics for Machine Check Errors provided by RASDaemon (sqlite3 output is required).
|
||||
# This plugin ONLY supports Linux on 386, amd64, arm, and arm64
|
||||
[[inputs.ras]]
|
||||
## Optional path to RASDaemon sqlite3 database.
|
||||
## Default: /var/lib/rasdaemon/ras-mc_event.db
|
||||
# db_path = ""
|
||||
```
|
||||
|
||||
In addition, `RASDaemon` runs, by default, with the `--enable-sqlite3` flag. In case
|
||||
of problems with the SQLite3 database, please verify this is still the default option.
|
||||
|
||||
## Metrics
|
||||
|
||||
- ras
|
||||
- tags:
|
||||
- socket_id
|
||||
- fields:
|
||||
- memory_read_corrected_errors
|
||||
- memory_read_uncorrectable_errors
|
||||
- memory_write_corrected_errors
|
||||
- memory_write_uncorrectable_errors
|
||||
- cache_l0_l1_errors
|
||||
- tlb_instruction_errors
|
||||
- cache_l2_errors
|
||||
- upi_errors
|
||||
- processor_base_errors
|
||||
- processor_bus_errors
|
||||
- internal_timer_errors
|
||||
- smm_handler_code_access_violation_errors
|
||||
- internal_parity_errors
|
||||
- frc_errors
|
||||
- external_mce_errors
|
||||
- microcode_rom_parity_errors
|
||||
- unclassified_mce_errors
|
||||
|
||||
Please note that `processor_base_errors` is an aggregate counter measuring the
|
||||
following MCE events:
|
||||
|
||||
- internal_timer_errors
|
||||
- smm_handler_code_access_violation_errors
|
||||
- internal_parity_errors
|
||||
- frc_errors
|
||||
- external_mce_errors
|
||||
- microcode_rom_parity_errors
|
||||
- unclassified_mce_errors
|
||||
|
||||
## Permissions
|
||||
|
||||
This plugin requires access to the SQLite3 database from `RASDaemon`. Please make
|
||||
sure that the user running Telegraf has the required permissions to this database.
|
||||
|
||||
## Example Output
|
||||
|
||||
```text
|
||||
ras,host=ubuntu,socket_id=0 cache_l0_l1_errors=7i,external_mce_errors=1i,frc_errors=1i,internal_parity_errors=1i,internal_timer_errors=1i,memory_read_corrected_errors=25i,memory_read_uncorrectable_errors=0i,memory_write_corrected_errors=5i,memory_write_uncorrectable_errors=0i,microcode_rom_parity_errors=1i,processor_base_errors=7i,processor_bus_errors=1i,smm_handler_code_access_violation_errors=1i,tlb_instruction_errors=5i,unclassified_mce_errors=1i 1598867393000000000
|
||||
ras,host=ubuntu cache_l2_errors=0i,upi_errors=0i 1598867393000000000
|
||||
```
|
335
plugins/inputs/ras/ras.go
Normal file
335
plugins/inputs/ras/ras.go
Normal file
|
@ -0,0 +1,335 @@
|
|||
//go:generate ../../../tools/readme_config_includer/generator
|
||||
//go:build linux && (386 || amd64 || arm || arm64)
|
||||
|
||||
package ras
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
_ "embed"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
// Required for SQL framework driver
|
||||
_ "modernc.org/sqlite"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
//go:embed sample.conf
|
||||
var sampleConfig string
|
||||
|
||||
const (
	// mceQuery selects every machine check record newer than the bound
	// timestamp parameter.
	mceQuery = `
		SELECT
			id, timestamp, error_msg, mcistatus_msg, socketid
		FROM mce_record
		WHERE timestamp > ?
		`

	// defaultDBPath is used when db_path is not configured.
	defaultDBPath = "/var/lib/rasdaemon/ras-mc_event.db"
	// dateLayout is the time layout used to parse record timestamps.
	dateLayout = "2006-01-02 15:04:05 -0700"

	// Memory error field names.
	memoryReadCorrected    = "memory_read_corrected_errors"
	memoryReadUncorrected  = "memory_read_uncorrectable_errors"
	memoryWriteCorrected   = "memory_write_corrected_errors"
	memoryWriteUncorrected = "memory_write_uncorrectable_errors"

	// Cache, TLB and interconnect error field names.
	instructionCache = "cache_l0_l1_errors"
	instructionTLB   = "tlb_instruction_errors"
	levelTwoCache    = "cache_l2_errors"
	upi              = "upi_errors"

	// Processor error field names; processorBase aggregates the seven
	// fields listed after processorBus.
	processorBase       = "processor_base_errors"
	processorBus        = "processor_bus_errors"
	internalTimer       = "internal_timer_errors"
	smmHandlerCode      = "smm_handler_code_access_violation_errors"
	internalParity      = "internal_parity_errors"
	frc                 = "frc_errors"
	externalMCEBase     = "external_mce_errors"
	microcodeROMParity  = "microcode_rom_parity_errors"
	unclassifiedMCEBase = "unclassified_mce_errors"
)
|
||||
|
||||
type Ras struct {
|
||||
DBPath string `toml:"db_path"`
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
|
||||
db *sql.DB
|
||||
latestTimestamp time.Time
|
||||
cpuSocketCounters map[int]metricCounters
|
||||
serverCounters metricCounters
|
||||
}
|
||||
|
||||
// machineCheckError is a single row read from the mce_record table.
type machineCheckError struct {
	id, socketID int

	timestamp    string
	errorMsg     string
	mciStatusMsg string
}
|
||||
|
||||
// metricCounters maps a metric field name to its running error count.
type metricCounters map[string]int64
|
||||
|
||||
func (*Ras) SampleConfig() string {
|
||||
return sampleConfig
|
||||
}
|
||||
|
||||
// Start initializes connection to DB, metrics are gathered in Gather
|
||||
func (r *Ras) Start(telegraf.Accumulator) error {
|
||||
err := validateDBPath(r.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
r.db, err = connectToDB(r.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Gather reads the stats provided by RASDaemon and writes it to the Accumulator.
|
||||
func (r *Ras) Gather(acc telegraf.Accumulator) error {
|
||||
rows, err := r.db.Query(mceQuery, r.latestTimestamp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
for rows.Next() {
|
||||
mcError, err := fetchMachineCheckError(rows)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tsErr := r.updateLatestTimestamp(mcError.timestamp)
|
||||
if tsErr != nil {
|
||||
return err
|
||||
}
|
||||
r.updateCounters(mcError)
|
||||
}
|
||||
|
||||
addCPUSocketMetrics(acc, r.cpuSocketCounters)
|
||||
addServerMetrics(acc, r.serverCounters)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop closes any existing DB connection
|
||||
func (r *Ras) Stop() {
|
||||
if r.db != nil {
|
||||
err := r.db.Close()
|
||||
if err != nil {
|
||||
r.Log.Errorf("Error appeared during closing DB (%s): %v", r.DBPath, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateLatestTimestamp(timestamp string) error {
|
||||
ts, err := parseDate(timestamp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if ts.After(r.latestTimestamp) {
|
||||
r.latestTimestamp = ts
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *Ras) updateCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.errorMsg, "No Error") {
|
||||
return
|
||||
}
|
||||
|
||||
r.initializeCPUMetricDataIfRequired(mcError.socketID)
|
||||
r.updateSocketCounters(mcError)
|
||||
r.updateServerCounters(mcError)
|
||||
}
|
||||
|
||||
func newMetricCounters() *metricCounters {
|
||||
return &metricCounters{
|
||||
memoryReadCorrected: 0,
|
||||
memoryReadUncorrected: 0,
|
||||
memoryWriteCorrected: 0,
|
||||
memoryWriteUncorrected: 0,
|
||||
instructionCache: 0,
|
||||
instructionTLB: 0,
|
||||
processorBase: 0,
|
||||
processorBus: 0,
|
||||
internalTimer: 0,
|
||||
smmHandlerCode: 0,
|
||||
internalParity: 0,
|
||||
frc: 0,
|
||||
externalMCEBase: 0,
|
||||
microcodeROMParity: 0,
|
||||
unclassifiedMCEBase: 0,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateServerCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.errorMsg, "CACHE Level-2") && strings.Contains(mcError.errorMsg, "Error") {
|
||||
r.serverCounters[levelTwoCache]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "UPI:") {
|
||||
r.serverCounters[upi]++
|
||||
}
|
||||
}
|
||||
|
||||
// validateDBPath checks that dbPath exists and refers to a regular file.
// It returns a descriptive error otherwise, and nil when the path is usable.
func validateDBPath(dbPath string) error {
	info, statErr := os.Stat(dbPath)

	switch {
	case os.IsNotExist(statErr):
		return fmt.Errorf("provided db_path does not exist: [%s]", dbPath)
	case statErr != nil:
		return fmt.Errorf("cannot get system information for db_path file %q: %w", dbPath, statErr)
	case !info.Mode().IsRegular():
		return fmt.Errorf("provided db_path does not point to a regular file: [%s]", dbPath)
	}

	return nil
}
|
||||
|
||||
// connectToDB opens a database handle for dbPath using the "sqlite" driver.
func connectToDB(dbPath string) (*sql.DB, error) {
	const driverName = "sqlite"

	handle, err := sql.Open(driverName, dbPath)
	return handle, err
}
|
||||
|
||||
func (r *Ras) initializeCPUMetricDataIfRequired(socketID int) {
|
||||
if _, ok := r.cpuSocketCounters[socketID]; !ok {
|
||||
r.cpuSocketCounters[socketID] = *newMetricCounters()
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateSocketCounters(mcError *machineCheckError) {
|
||||
r.updateMemoryCounters(mcError)
|
||||
r.updateProcessorBaseCounters(mcError)
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "Instruction TLB") && strings.Contains(mcError.errorMsg, "Error") {
|
||||
r.cpuSocketCounters[mcError.socketID][instructionTLB]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "BUS") && strings.Contains(mcError.errorMsg, "Error") {
|
||||
r.cpuSocketCounters[mcError.socketID][processorBus]++
|
||||
}
|
||||
|
||||
if (strings.Contains(mcError.errorMsg, "CACHE Level-0") ||
|
||||
strings.Contains(mcError.errorMsg, "CACHE Level-1")) &&
|
||||
strings.Contains(mcError.errorMsg, "Error") {
|
||||
r.cpuSocketCounters[mcError.socketID][instructionCache]++
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateProcessorBaseCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.errorMsg, "Internal Timer error") {
|
||||
r.cpuSocketCounters[mcError.socketID][internalTimer]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "SMM Handler Code Access Violation") {
|
||||
r.cpuSocketCounters[mcError.socketID][smmHandlerCode]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "Internal parity error") {
|
||||
r.cpuSocketCounters[mcError.socketID][internalParity]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "FRC error") {
|
||||
r.cpuSocketCounters[mcError.socketID][frc]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "External error") {
|
||||
r.cpuSocketCounters[mcError.socketID][externalMCEBase]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "Microcode ROM parity error") {
|
||||
r.cpuSocketCounters[mcError.socketID][microcodeROMParity]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
|
||||
if strings.Contains(mcError.errorMsg, "Unclassified") || strings.Contains(mcError.errorMsg, "Internal unclassified") {
|
||||
r.cpuSocketCounters[mcError.socketID][unclassifiedMCEBase]++
|
||||
r.cpuSocketCounters[mcError.socketID][processorBase]++
|
||||
}
|
||||
}
|
||||
|
||||
func (r *Ras) updateMemoryCounters(mcError *machineCheckError) {
|
||||
if strings.Contains(mcError.errorMsg, "Memory read error") {
|
||||
if strings.Contains(mcError.mciStatusMsg, "Corrected_error") {
|
||||
r.cpuSocketCounters[mcError.socketID][memoryReadCorrected]++
|
||||
} else {
|
||||
r.cpuSocketCounters[mcError.socketID][memoryReadUncorrected]++
|
||||
}
|
||||
}
|
||||
if strings.Contains(mcError.errorMsg, "Memory write error") {
|
||||
if strings.Contains(mcError.mciStatusMsg, "Corrected_error") {
|
||||
r.cpuSocketCounters[mcError.socketID][memoryWriteCorrected]++
|
||||
} else {
|
||||
r.cpuSocketCounters[mcError.socketID][memoryWriteUncorrected]++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func addCPUSocketMetrics(acc telegraf.Accumulator, cpuSocketCounters map[int]metricCounters) {
|
||||
for socketID, data := range cpuSocketCounters {
|
||||
tags := map[string]string{
|
||||
"socket_id": strconv.Itoa(socketID),
|
||||
}
|
||||
fields := make(map[string]interface{})
|
||||
|
||||
for errorName, count := range data {
|
||||
fields[errorName] = count
|
||||
}
|
||||
|
||||
acc.AddCounter("ras", fields, tags)
|
||||
}
|
||||
}
|
||||
|
||||
func addServerMetrics(acc telegraf.Accumulator, counters map[string]int64) {
|
||||
fields := make(map[string]interface{})
|
||||
for errorName, count := range counters {
|
||||
fields[errorName] = count
|
||||
}
|
||||
|
||||
acc.AddCounter("ras", fields, make(map[string]string))
|
||||
}
|
||||
|
||||
func fetchMachineCheckError(rows *sql.Rows) (*machineCheckError, error) {
|
||||
mcError := &machineCheckError{}
|
||||
err := rows.Scan(&mcError.id, &mcError.timestamp, &mcError.errorMsg, &mcError.mciStatusMsg, &mcError.socketID)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return mcError, nil
|
||||
}
|
||||
|
||||
func parseDate(date string) (time.Time, error) {
|
||||
return time.Parse(dateLayout, date)
|
||||
}
|
||||
|
||||
func init() {
|
||||
inputs.Add("ras", func() telegraf.Input {
|
||||
//nolint:errcheck // known timestamp
|
||||
defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
|
||||
return &Ras{
|
||||
DBPath: defaultDBPath,
|
||||
latestTimestamp: defaultTimestamp,
|
||||
cpuSocketCounters: map[int]metricCounters{
|
||||
0: *newMetricCounters(),
|
||||
},
|
||||
serverCounters: map[string]int64{
|
||||
levelTwoCache: 0,
|
||||
upi: 0,
|
||||
},
|
||||
}
|
||||
})
|
||||
}
|
33
plugins/inputs/ras/ras_notlinux.go
Normal file
33
plugins/inputs/ras/ras_notlinux.go
Normal file
|
@ -0,0 +1,33 @@
|
|||
//go:generate ../../../tools/readme_config_includer/generator
|
||||
//go:build !linux || (linux && !386 && !amd64 && !arm && !arm64)
|
||||
|
||||
package ras
|
||||
|
||||
import (
|
||||
_ "embed"
|
||||
|
||||
"github.com/influxdata/telegraf"
|
||||
"github.com/influxdata/telegraf/plugins/inputs"
|
||||
)
|
||||
|
||||
//go:embed sample.conf
|
||||
var sampleConfig string
|
||||
|
||||
type Ras struct {
|
||||
Log telegraf.Logger `toml:"-"`
|
||||
}
|
||||
|
||||
// SampleConfig returns the embedded sample configuration of the plugin.
func (*Ras) SampleConfig() string {
	return sampleConfig
}
|
||||
|
||||
func (r *Ras) Init() error {
|
||||
r.Log.Warn("Current platform is not supported")
|
||||
return nil
|
||||
}
|
||||
|
||||
// Gather is a no-op on unsupported platforms.
func (*Ras) Gather(telegraf.Accumulator) error {
	return nil
}
|
||||
|
||||
func init() {
|
||||
inputs.Add("ras", func() telegraf.Input {
|
||||
return &Ras{}
|
||||
})
|
||||
}
|
254
plugins/inputs/ras/ras_test.go
Normal file
254
plugins/inputs/ras/ras_test.go
Normal file
|
@ -0,0 +1,254 @@
|
|||
//go:build linux && (386 || amd64 || arm || arm64)
|
||||
|
||||
package ras
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/influxdata/telegraf/testutil"
|
||||
)
|
||||
|
||||
func TestUpdateCounters(t *testing.T) {
|
||||
ras := newRas()
|
||||
for i := range testData {
|
||||
ras.updateCounters(&testData[i])
|
||||
}
|
||||
|
||||
require.Len(t, ras.cpuSocketCounters, 1, "Should contain counters only for single socket")
|
||||
|
||||
for metric, value := range ras.cpuSocketCounters[0] {
|
||||
if metric == processorBase {
|
||||
// processor_base_errors is sum of other seven errors: internal_timer_errors, smm_handler_code_access_violation_errors,
|
||||
// internal_parity_errors, frc_errors, external_mce_errors, microcode_rom_parity_errors and unclassified_mce_errors
|
||||
require.Equal(t, int64(7), value, processorBase+" should have value of 7")
|
||||
} else {
|
||||
require.Equal(t, int64(1), value, metric+" should have value of 1")
|
||||
}
|
||||
}
|
||||
|
||||
for metric, value := range ras.serverCounters {
|
||||
require.Equal(t, int64(1), value, metric+" should have value of 1")
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateLatestTimestamp(t *testing.T) {
|
||||
ras := newRas()
|
||||
ts := "2020-08-01 15:13:27 +0200"
|
||||
testData = append(testData, []machineCheckError{
|
||||
{
|
||||
timestamp: "2019-05-20 08:25:55 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "",
|
||||
mciStatusMsg: "",
|
||||
},
|
||||
{
|
||||
timestamp: "2018-02-21 12:27:22 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "",
|
||||
mciStatusMsg: "",
|
||||
},
|
||||
{
|
||||
timestamp: ts,
|
||||
socketID: 0,
|
||||
errorMsg: "",
|
||||
mciStatusMsg: "",
|
||||
},
|
||||
}...)
|
||||
for _, mce := range testData {
|
||||
err := ras.updateLatestTimestamp(mce.timestamp)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
require.Equal(t, ts, ras.latestTimestamp.Format(dateLayout))
|
||||
}
|
||||
|
||||
func TestMultipleSockets(t *testing.T) {
|
||||
ras := newRas()
|
||||
cacheL2 := "Instruction CACHE Level-2 Generic Error"
|
||||
overflow := "Error_overflow Corrected_error"
|
||||
testData = []machineCheckError{
|
||||
{
|
||||
timestamp: "2019-05-20 08:25:55 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: cacheL2,
|
||||
mciStatusMsg: overflow,
|
||||
},
|
||||
{
|
||||
timestamp: "2018-02-21 12:27:22 +0200",
|
||||
socketID: 1,
|
||||
errorMsg: cacheL2,
|
||||
mciStatusMsg: overflow,
|
||||
},
|
||||
{
|
||||
timestamp: "2020-03-21 14:17:28 +0200",
|
||||
socketID: 2,
|
||||
errorMsg: cacheL2,
|
||||
mciStatusMsg: overflow,
|
||||
},
|
||||
{
|
||||
timestamp: "2020-03-21 17:24:18 +0200",
|
||||
socketID: 3,
|
||||
errorMsg: cacheL2,
|
||||
mciStatusMsg: overflow,
|
||||
},
|
||||
}
|
||||
for i := range testData {
|
||||
ras.updateCounters(&testData[i])
|
||||
}
|
||||
require.Len(t, ras.cpuSocketCounters, 4, "Should contain counters for four sockets")
|
||||
|
||||
for _, metricData := range ras.cpuSocketCounters {
|
||||
for metric, value := range metricData {
|
||||
if metric == levelTwoCache {
|
||||
require.Equal(t, int64(1), value, levelTwoCache+" should have value of 1")
|
||||
} else {
|
||||
require.Equal(t, int64(0), value, metric+" should have value of 0")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingDatabase(t *testing.T) {
|
||||
var acc testutil.Accumulator
|
||||
ras := newRas()
|
||||
ras.DBPath = "/nonexistent/ras.db"
|
||||
err := ras.Start(&acc)
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestEmptyDatabase(t *testing.T) {
|
||||
ras := newRas()
|
||||
|
||||
require.Len(t, ras.cpuSocketCounters, 1, "Should contain default counters for one socket")
|
||||
require.Len(t, ras.serverCounters, 2, "Should contain default counters for server")
|
||||
|
||||
for metric, value := range ras.cpuSocketCounters[0] {
|
||||
require.Equal(t, int64(0), value, metric+" should have value of 0")
|
||||
}
|
||||
|
||||
for metric, value := range ras.serverCounters {
|
||||
require.Equal(t, int64(0), value, metric+" should have value of 0")
|
||||
}
|
||||
}
|
||||
|
||||
func newRas() *Ras {
|
||||
//nolint:errcheck // known timestamp
|
||||
defaultTimestamp, _ := parseDate("1970-01-01 00:00:01 -0700")
|
||||
return &Ras{
|
||||
DBPath: defaultDBPath,
|
||||
latestTimestamp: defaultTimestamp,
|
||||
cpuSocketCounters: map[int]metricCounters{
|
||||
0: *newMetricCounters(),
|
||||
},
|
||||
serverCounters: map[string]int64{
|
||||
levelTwoCache: 0,
|
||||
upi: 0,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
var testData = []machineCheckError{
|
||||
{
|
||||
timestamp: "2020-05-20 07:34:53 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 07:35:11 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "MEMORY CONTROLLER RD_CHANNEL0_ERR Transaction: Memory read error",
|
||||
mciStatusMsg: "Uncorrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 07:37:50 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "MEMORY CONTROLLER RD_CHANNEL2_ERR Transaction: Memory write error",
|
||||
mciStatusMsg: "Uncorrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:14:51 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "MEMORY CONTROLLER WR_CHANNEL2_ERR Transaction: Memory write error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:15:31 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "corrected filtering (some unreported errors in same region) Instruction CACHE Level-0 Read Error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:16:32 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "Instruction TLB Level-0 Error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:16:56 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "No Error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:17:24 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "Unclassified",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:17:41 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "Microcode ROM parity error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:17:48 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "FRC error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:18:18 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "Internal parity error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:18:34 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "SMM Handler Code Access Violation",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:18:54 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "Internal Timer error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:21:23 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "BUS Level-3 Generic Generic IO Request-did-not-timeout Error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:23:23 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "External error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:25:31 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "UPI: COR LL Rx detected CRC error - successful LLR without Phy Reinit",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
{
|
||||
timestamp: "2020-05-20 08:25:55 +0200",
|
||||
socketID: 0,
|
||||
errorMsg: "Instruction CACHE Level-2 Generic Error",
|
||||
mciStatusMsg: "Error_overflow Corrected_error",
|
||||
},
|
||||
}
|
6
plugins/inputs/ras/sample.conf
Normal file
6
plugins/inputs/ras/sample.conf
Normal file
|
@ -0,0 +1,6 @@
|
|||
# RAS plugin exposes counter metrics for Machine Check Errors provided by RASDaemon (sqlite3 output is required).
|
||||
# This plugin ONLY supports Linux on 386, amd64, arm, and arm64
|
||||
[[inputs.ras]]
|
||||
## Optional path to RASDaemon sqlite3 database.
|
||||
## Default: /var/lib/rasdaemon/ras-mc_event.db
|
||||
# db_path = ""
|
Loading…
Add table
Add a link
Reference in a new issue