Skip to content

Commit

Permalink
feat(inputs.infiniband_hw): add a new input plugin for Infiniband car…
Browse files Browse the repository at this point in the history
…d/ports HW statistics
  • Loading branch information
izekr authored Dec 19, 2024
1 parent d829a5b commit 09335fb
Show file tree
Hide file tree
Showing 7 changed files with 360 additions and 0 deletions.
5 changes: 5 additions & 0 deletions plugins/inputs/all/infiniband_hw.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
//go:build !custom || inputs || inputs.infiniband_hw

package all

import _ "github.com/influxdata/telegraf/plugins/inputs/infiniband_hw" // register plugin
75 changes: 75 additions & 0 deletions plugins/inputs/infiniband_hw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# InfiniBand Hardware Input Plugin

This plugin gathers statistics for all InfiniBand devices and ports on the
system. These are the hardware counters that can be found in
`/sys/class/infiniband/<dev>/ports/<port>/hw_counters/`.

**Supported Platforms**: Linux

## Global configuration options <!-- @/docs/includes/plugin_config.md -->

In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and field or create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.

[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins

## Configuration

```toml @sample.conf
# Gets hardware counters from all InfiniBand cards and ports installed
# This plugin ONLY supports Linux
[[inputs.infiniband_hw]]
# no configuration
```

## Metrics

The actual metrics depend on the InfiniBand devices; the plugin reports each
hw_counter as a field named after the counter and holding the counter's value.

[Information about hw_counters][hw_counters] collected is provided by Nvidia.

[hw_counters]: https://enterprise-support.nvidia.com/s/article/understanding-mlx5-linux-counters-and-status-parameters

- infiniband_hw
- tags:
- device
- port
- fields:
- duplicate_request (integer)
- implied_nak_seq_err (integer)
- lifespan (integer)
- local_ack_timeout_err (integer)
- np_cnp_sent (integer)
- np_ecn_marked_roce_packets (integer)
- out_of_buffer (integer)
- out_of_sequence (integer)
- packet_seq_err (integer)
- req_cqe_error (integer)
- req_cqe_flush_error (integer)
- req_remote_access_errors (integer)
- req_remote_invalid_request (integer)
- resp_cqe_error (integer)
- resp_cqe_flush_error (integer)
- resp_local_length_error (integer)
- resp_remote_access_errors (integer)
- rnr_nak_retry_err (integer)
- roce_adp_retrans (integer)
- roce_adp_retrans_to (integer)
- roce_slow_restart (integer)
- roce_slow_restart_cnps (integer)
- roce_slow_restart_trans (integer)
- rp_cnp_handled (integer)
- rp_cnp_ignored (integer)
- rx_atomic_requests (integer)
- rx_icrc_encapsulated (integer)
- rx_read_requests (integer)
- rx_write_requests (integer)

## Example Output

```text
infiniband_hw,device=mlx5_4,host=host1,port=1 local_ack_timeout_err=0i,req_cqe_error=0i,roce_slow_restart=0i,roce_adp_retrans=0i,rx_atomic_requests=0i,np_ecn_marked_roce_packets=0i,rp_cnp_handled=0i,req_remote_access_errors=0i,np_cnp_sent=0i,resp_cqe_error=0i,out_of_sequence=0i,roce_slow_restart_cnps=0i,req_remote_invalid_request=0i,implied_nak_seq_err=0i,rp_cnp_ignored=0i,resp_local_length_error=0i,lifespan=10i,out_of_buffer=0i,rx_write_requests=0i,resp_cqe_flush_error=0i,rx_icrc_encapsulated=0i,rx_read_requests=0i,resp_remote_access_errors=0i,roce_adp_retrans_to=0i,roce_slow_restart_trans=0i,rnr_nak_retry_err=0i,req_cqe_flush_error=0i,packet_seq_err=0i,duplicate_request=0i 1734520190000000000
```
25 changes: 25 additions & 0 deletions plugins/inputs/infiniband_hw/infiniband_hw.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//go:generate ../../../tools/readme_config_includer/generator
package infiniband_hw

import (
_ "embed"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
)

//go:embed sample.conf
var sampleConfig string

// InfinibandHW is the input plugin registered under the name "infiniband_hw".
// Its Gather method is supplied by the build-constrained files of this package
// (one for Linux, one for every other platform).
type InfinibandHW struct {
// Log is the plugin's logger; the toml:"-" tag keeps it out of the
// configuration file.
Log telegraf.Logger `toml:"-"`
}

// SampleConfig returns the contents of sample.conf, which are embedded into
// the sampleConfig variable via go:embed.
func (*InfinibandHW) SampleConfig() string {
return sampleConfig
}

// init registers the plugin under the name "infiniband_hw" with a creator
// that hands out a fresh, zero-valued InfinibandHW.
func init() {
	inputs.Add("infiniband_hw", func() telegraf.Input {
		return &InfinibandHW{}
	})
}
53 changes: 53 additions & 0 deletions plugins/inputs/infiniband_hw/infiniband_hw_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//go:build linux

package infiniband_hw

import (
	"errors"
	"fmt"
	"strconv"

	"github.com/Mellanox/rdmamap"

	"github.com/influxdata/telegraf"
)

// Gather collects the hardware counters of every port of every RDMA device
// returned by rdmamap and adds one "infiniband_hw" metric per port to acc.
//
// It returns an error when no device is found, or when a port name cannot be
// parsed as an integer (the error names the offending port and device). A port
// whose counters cannot be read is skipped so the remaining ports are still
// reported; the reason is logged at debug level rather than dropped silently.
//
// NOTE(review): assumes i.Log has been populated before Gather runs — confirm
// for callers that construct InfinibandHW directly.
func (i *InfinibandHW) Gather(acc telegraf.Accumulator) error {
	rdmaDevices := rdmamap.GetRdmaDeviceList()
	if len(rdmaDevices) == 0 {
		return errors.New("no InfiniBand devices found in /sys/class/infiniband/")
	}

	for _, dev := range rdmaDevices {
		for _, port := range rdmamap.GetPorts(dev) {
			portInt, err := strconv.Atoi(port)
			if err != nil {
				return fmt.Errorf("parsing port %q of device %q: %w", port, dev, err)
			}

			stats, err := rdmamap.GetRdmaSysfsHwStats(dev, portInt)
			if err != nil {
				// Best effort: keep gathering the other ports, but leave a
				// trace of why this one produced no metric.
				i.Log.Debugf("Skipping port %q of device %q: %v", port, dev, err)
				continue
			}

			addStats(dev, port, stats, acc)
		}
	}

	return nil
}

// addStats turns the counters of a single device port into one
// "infiniband_hw" metric and hands it to the accumulator. Every entry becomes
// a field keyed by the entry's name.
func addStats(dev, port string, stats []rdmamap.RdmaStatEntry, acc telegraf.Accumulator) {
	counters := make(map[string]interface{})
	for idx := range stats {
		counters[stats[idx].Name] = stats[idx].Value
	}

	// Tagging by device and port lets users filter on either one.
	acc.AddFields("infiniband_hw", counters, map[string]string{
		"device": dev,
		"port":   port,
	})
}
23 changes: 23 additions & 0 deletions plugins/inputs/infiniband_hw/infiniband_hw_notlinux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//go:build !linux

package infiniband_hw

import (
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/plugins/inputs"
)

// Init is the variant compiled for platforms other than Linux. It logs a
// warning that the current platform is not supported and always returns nil.
func (i *InfinibandHW) Init() error {
i.Log.Warn("Current platform is not supported")
return nil
}

// Gather is the variant compiled for platforms other than Linux. It ignores
// the accumulator, adds no metrics and always returns nil.
func (*InfinibandHW) Gather(_ telegraf.Accumulator) error {
return nil
}

// init registers the plugin under the name "infiniband_hw" in builds for
// platforms other than Linux.
//
// NOTE(review): infiniband_hw.go, which carries no build constraint, already
// registers the same name in its own init, so this second registration looks
// redundant — confirm and consider removing it together with the inputs import.
func init() {
inputs.Add("infiniband_hw", func() telegraf.Input {
return &InfinibandHW{}
})
}
175 changes: 175 additions & 0 deletions plugins/inputs/infiniband_hw/infiniband_hw_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
//go:build linux

package infiniband_hw

import (
"testing"

"github.com/Mellanox/rdmamap"

"github.com/influxdata/telegraf/testutil"
)

// TestInfinibandHw feeds a fixed set of hardware counters to addStats and
// checks that they reach the accumulator as an "infiniband_hw" metric carrying
// the expected fields and the device/port tags.
func TestInfinibandHw(t *testing.T) {
	const (
		device = "m1x5_0"
		port   = "1"
	)

	// Counters as they would be handed over by rdmamap for one port.
	input := []rdmamap.RdmaStatEntry{
		{Name: "duplicate_request", Value: uint64(0)},
		{Name: "implied_nak_seq_err", Value: uint64(0)},
		{Name: "lifespan", Value: uint64(10)},
		{Name: "local_ack_timeout_err", Value: uint64(38)},
		{Name: "np_cnp_sent", Value: uint64(10284520)},
		{Name: "np_ecn_marked_roce_packets", Value: uint64(286733949)},
		{Name: "out_of_buffer", Value: uint64(1149772)},
		{Name: "out_of_sequence", Value: uint64(44)},
		{Name: "packet_seq_err", Value: uint64(1)},
		{Name: "req_cqe_error", Value: uint64(10776)},
		{Name: "req_cqe_flush_error", Value: uint64(2173)},
		{Name: "req_remote_access_errors", Value: uint64(0)},
		{Name: "req_remote_invalid_request", Value: uint64(0)},
		{Name: "resp_cqe_error", Value: uint64(759)},
		{Name: "resp_cqe_flush_error", Value: uint64(759)},
		{Name: "resp_local_length_error", Value: uint64(0)},
		{Name: "resp_remote_access_errors", Value: uint64(0)},
		{Name: "rnr_nak_retry_err", Value: uint64(0)},
		{Name: "roce_adp_retrans", Value: uint64(0)},
		{Name: "roce_adp_retrans_to", Value: uint64(0)},
		{Name: "roce_slow_restart", Value: uint64(0)},
		{Name: "roce_slow_restart_cnps", Value: uint64(0)},
		{Name: "roce_slow_restart_trans", Value: uint64(0)},
		{Name: "rp_cnp_handled", Value: uint64(1)},
		{Name: "rp_cnp_ignored", Value: uint64(0)},
		{Name: "rx_atomic_requests", Value: uint64(0)},
		{Name: "rx_icrc_encapsulated", Value: uint64(0)},
		{Name: "rx_read_requests", Value: uint64(488228)},
		{Name: "rx_write_requests", Value: uint64(3928699)},
	}

	// The expectation is spelled out independently of the input so the test
	// does not merely mirror the implementation.
	expectedFields := map[string]interface{}{
		"duplicate_request":          uint64(0),
		"implied_nak_seq_err":        uint64(0),
		"lifespan":                   uint64(10),
		"local_ack_timeout_err":      uint64(38),
		"np_cnp_sent":                uint64(10284520),
		"np_ecn_marked_roce_packets": uint64(286733949),
		"out_of_buffer":              uint64(1149772),
		"out_of_sequence":            uint64(44),
		"packet_seq_err":             uint64(1),
		"req_cqe_error":              uint64(10776),
		"req_cqe_flush_error":        uint64(2173),
		"req_remote_access_errors":   uint64(0),
		"req_remote_invalid_request": uint64(0),
		"resp_cqe_error":             uint64(759),
		"resp_cqe_flush_error":       uint64(759),
		"resp_local_length_error":    uint64(0),
		"resp_remote_access_errors":  uint64(0),
		"rnr_nak_retry_err":          uint64(0),
		"roce_adp_retrans":           uint64(0),
		"roce_adp_retrans_to":        uint64(0),
		"roce_slow_restart":          uint64(0),
		"roce_slow_restart_cnps":     uint64(0),
		"roce_slow_restart_trans":    uint64(0),
		"rp_cnp_handled":             uint64(1),
		"rp_cnp_ignored":             uint64(0),
		"rx_atomic_requests":         uint64(0),
		"rx_icrc_encapsulated":       uint64(0),
		"rx_read_requests":           uint64(488228),
		"rx_write_requests":          uint64(3928699),
	}
	expectedTags := map[string]string{"device": device, "port": port}

	acc := &testutil.Accumulator{}
	addStats(device, port, input, acc)

	acc.AssertContainsTaggedFields(t, "infiniband_hw", expectedFields, expectedTags)
}
4 changes: 4 additions & 0 deletions plugins/inputs/infiniband_hw/sample.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Gets hardware counters from all InfiniBand cards and ports installed
# This plugin ONLY supports Linux
[[inputs.infiniband_hw]]
# no configuration

0 comments on commit 09335fb

Please sign in to comment.