-
Notifications
You must be signed in to change notification settings - Fork 5.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(inputs.infiniband_hw): add a new input plugin for Infiniband car…
…d/ports HW statistics
- Loading branch information
Showing
7 changed files
with
360 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
//go:build !custom || inputs || inputs.infiniband_hw | ||
|
||
package all | ||
|
||
import _ "github.com/influxdata/telegraf/plugins/inputs/infiniband_hw" // register plugin |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# InfiniBand Hardware Input Plugin | ||
|
||
This plugin gathers statistics for all InfiniBand devices and ports on the | ||
system. These are the hardware counters that can be found in | ||
`/sys/class/infiniband/<dev>/ports/<port>/hw_counters/` | ||
|
||
**Supported Platforms**: Linux | ||
|
||
## Global configuration options <!-- @/docs/includes/plugin_config.md --> | ||
|
||
In addition to the plugin-specific configuration settings, plugins support | ||
additional global and plugin configuration settings. These settings are used to | ||
modify metrics, tags, and field or create aliases and configure ordering, etc. | ||
See the [CONFIGURATION.md][CONFIGURATION.md] for more details. | ||
|
||
[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins | ||
|
||
## Configuration | ||
|
||
```toml @sample.conf | ||
# Gets hardware counters from all InfiniBand cards and ports installed | ||
# This plugin ONLY supports Linux | ||
[[inputs.infiniband_hw]] | ||
# no configuration | ||
``` | ||
|
||
## Metrics | ||
|
||
Actual metrics depend on the InfiniBand devices; the plugin uses a simple | ||
mapping from each hw_counter name to its value. | ||
|
||
[Information about hw_counters][hw_counters] collected is provided by Nvidia. | ||
|
||
[hw_counters]: https://enterprise-support.nvidia.com/s/article/understanding-mlx5-linux-counters-and-status-parameters | ||
|
||
- infiniband_hw | ||
- tags: | ||
- device | ||
- port | ||
- fields: | ||
- duplicate_request (integer) | ||
- implied_nak_seq_err (integer) | ||
- lifespan (integer) | ||
- local_ack_timeout_err (integer) | ||
- np_cnp_sent (integer) | ||
- np_ecn_marked_roce_packets (integer) | ||
- out_of_buffer (integer) | ||
- out_of_sequence (integer) | ||
- packet_seq_err (integer) | ||
- req_cqe_error (integer) | ||
- req_cqe_flush_error (integer) | ||
- req_remote_access_errors (integer) | ||
- req_remote_invalid_request (integer) | ||
- resp_cqe_error (integer) | ||
- resp_cqe_flush_error (integer) | ||
- resp_local_length_error (integer) | ||
- resp_remote_access_errors (integer) | ||
- rnr_nak_retry_err (integer) | ||
- roce_adp_retrans (integer) | ||
- roce_adp_retrans_to (integer) | ||
- roce_slow_restart (integer) | ||
- roce_slow_restart_cnps (integer) | ||
- roce_slow_restart_trans (integer) | ||
- rp_cnp_handled (integer) | ||
- rp_cnp_ignored (integer) | ||
- rx_atomic_requests (integer) | ||
- rx_icrc_encapsulated (integer) | ||
- rx_read_requests (integer) | ||
- rx_write_requests (integer) | ||
|
||
## Example Output | ||
|
||
```text | ||
infiniband_hw,device=mlx5_4,host=host1,port=1 local_ack_timeout_err=0i,req_cqe_error=0i,roce_slow_restart=0i,roce_adp_retrans=0i,rx_atomic_requests=0i,np_ecn_marked_roce_packets=0i,rp_cnp_handled=0i,req_remote_access_errors=0i,np_cnp_sent=0i,resp_cqe_error=0i,out_of_sequence=0i,roce_slow_restart_cnps=0i,req_remote_invalid_request=0i,implied_nak_seq_err=0i,rp_cnp_ignored=0i,resp_local_length_error=0i,lifespan=10i,out_of_buffer=0i,rx_write_requests=0i,resp_cqe_flush_error=0i,rx_icrc_encapsulated=0i,rx_read_requests=0i,resp_remote_access_errors=0i,roce_adp_retrans_to=0i,roce_slow_restart_trans=0i,rnr_nak_retry_err=0i,req_cqe_flush_error=0i,packet_seq_err=0i,duplicate_request=0i 1734520190000000000 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
//go:generate ../../../tools/readme_config_includer/generator | ||
package infiniband_hw | ||
|
||
import ( | ||
_ "embed" | ||
|
||
"github.com/influxdata/telegraf" | ||
"github.com/influxdata/telegraf/plugins/inputs" | ||
) | ||
|
||
//go:embed sample.conf | ||
var sampleConfig string | ||
|
||
type InfinibandHW struct { | ||
Log telegraf.Logger `toml:"-"` | ||
} | ||
|
||
func (*InfinibandHW) SampleConfig() string { | ||
return sampleConfig | ||
} | ||
|
||
// Initialise plugin | ||
func init() { | ||
inputs.Add("infiniband_hw", func() telegraf.Input { return &InfinibandHW{} }) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
//go:build linux | ||
|
||
package infiniband_hw | ||
|
||
import ( | ||
"errors" | ||
"strconv" | ||
|
||
"github.com/Mellanox/rdmamap" | ||
|
||
"github.com/influxdata/telegraf" | ||
) | ||
|
||
// Gather hardware statistics from our infiniband cards | ||
func (i *InfinibandHW) Gather(acc telegraf.Accumulator) error { | ||
rdmaDevices := rdmamap.GetRdmaDeviceList() | ||
|
||
if len(rdmaDevices) == 0 { | ||
return errors.New("no InfiniBand devices found in /sys/class/infiniband/") | ||
} | ||
|
||
for _, dev := range rdmaDevices { | ||
devicePorts := rdmamap.GetPorts(dev) | ||
for _, port := range devicePorts { | ||
portInt, err := strconv.Atoi(port) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
stats, err := rdmamap.GetRdmaSysfsHwStats(dev, portInt) | ||
if err != nil { | ||
continue | ||
} | ||
|
||
addStats(dev, port, stats, acc) | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
// Add the statistics to the accumulator | ||
func addStats(dev, port string, stats []rdmamap.RdmaStatEntry, acc telegraf.Accumulator) { | ||
// Allow users to filter by card and port | ||
tags := map[string]string{"device": dev, "port": port} | ||
fields := make(map[string]interface{}) | ||
|
||
for _, entry := range stats { | ||
fields[entry.Name] = entry.Value | ||
} | ||
|
||
acc.AddFields("infiniband_hw", fields, tags) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
//go:build !linux | ||
|
||
package infiniband_hw | ||
|
||
import ( | ||
"github.com/influxdata/telegraf" | ||
"github.com/influxdata/telegraf/plugins/inputs" | ||
) | ||
|
||
func (i *InfinibandHW) Init() error { | ||
i.Log.Warn("Current platform is not supported") | ||
return nil | ||
} | ||
|
||
func (*InfinibandHW) Gather(_ telegraf.Accumulator) error { | ||
return nil | ||
} | ||
|
||
func init() { | ||
inputs.Add("infiniband_hw", func() telegraf.Input { | ||
return &InfinibandHW{} | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
//go:build linux | ||
|
||
package infiniband_hw | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/Mellanox/rdmamap" | ||
|
||
"github.com/influxdata/telegraf/testutil" | ||
) | ||
|
||
func TestInfinibandHw(t *testing.T) { | ||
fields := map[string]interface{}{ | ||
"duplicate_request": uint64(0), | ||
"implied_nak_seq_err": uint64(0), | ||
"lifespan": uint64(10), | ||
"local_ack_timeout_err": uint64(38), | ||
"np_cnp_sent": uint64(10284520), | ||
"np_ecn_marked_roce_packets": uint64(286733949), | ||
"out_of_buffer": uint64(1149772), | ||
"out_of_sequence": uint64(44), | ||
"packet_seq_err": uint64(1), | ||
"req_cqe_error": uint64(10776), | ||
"req_cqe_flush_error": uint64(2173), | ||
"req_remote_access_errors": uint64(0), | ||
"req_remote_invalid_request": uint64(0), | ||
"resp_cqe_error": uint64(759), | ||
"resp_cqe_flush_error": uint64(759), | ||
"resp_local_length_error": uint64(0), | ||
"resp_remote_access_errors": uint64(0), | ||
"rnr_nak_retry_err": uint64(0), | ||
"roce_adp_retrans": uint64(0), | ||
"roce_adp_retrans_to": uint64(0), | ||
"roce_slow_restart": uint64(0), | ||
"roce_slow_restart_cnps": uint64(0), | ||
"roce_slow_restart_trans": uint64(0), | ||
"rp_cnp_handled": uint64(1), | ||
"rp_cnp_ignored": uint64(0), | ||
"rx_atomic_requests": uint64(0), | ||
"rx_icrc_encapsulated": uint64(0), | ||
"rx_read_requests": uint64(488228), | ||
"rx_write_requests": uint64(3928699), | ||
} | ||
|
||
tags := map[string]string{ | ||
"device": "m1x5_0", | ||
"port": "1", | ||
} | ||
|
||
sampleRdmaHwstatsEntries := []rdmamap.RdmaStatEntry{ | ||
{ | ||
Name: "duplicate_request", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "implied_nak_seq_err", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "lifespan", | ||
Value: uint64(10), | ||
}, | ||
{ | ||
Name: "local_ack_timeout_err", | ||
Value: uint64(38), | ||
}, | ||
{ | ||
Name: "np_cnp_sent", | ||
Value: uint64(10284520), | ||
}, | ||
{ | ||
Name: "np_ecn_marked_roce_packets", | ||
Value: uint64(286733949), | ||
}, | ||
{ | ||
Name: "out_of_buffer", | ||
Value: uint64(1149772), | ||
}, | ||
{ | ||
Name: "out_of_sequence", | ||
Value: uint64(44), | ||
}, | ||
{ | ||
Name: "packet_seq_err", | ||
Value: uint64(1), | ||
}, | ||
{ | ||
Name: "req_cqe_error", | ||
Value: uint64(10776), | ||
}, | ||
{ | ||
Name: "req_cqe_flush_error", | ||
Value: uint64(2173), | ||
}, | ||
{ | ||
Name: "req_remote_access_errors", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "req_remote_invalid_request", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "resp_cqe_error", | ||
Value: uint64(759), | ||
}, | ||
{ | ||
Name: "resp_cqe_flush_error", | ||
Value: uint64(759), | ||
}, | ||
{ | ||
Name: "resp_local_length_error", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "resp_remote_access_errors", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "rnr_nak_retry_err", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "roce_adp_retrans", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "roce_adp_retrans_to", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "roce_slow_restart", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "roce_slow_restart_cnps", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "roce_slow_restart_trans", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "rp_cnp_handled", | ||
Value: uint64(1), | ||
}, | ||
{ | ||
Name: "rp_cnp_ignored", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "rx_atomic_requests", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "rx_icrc_encapsulated", | ||
Value: uint64(0), | ||
}, | ||
{ | ||
Name: "rx_read_requests", | ||
Value: uint64(488228), | ||
}, | ||
{ | ||
Name: "rx_write_requests", | ||
Value: uint64(3928699), | ||
}, | ||
} | ||
|
||
var acc testutil.Accumulator | ||
|
||
addStats("m1x5_0", "1", sampleRdmaHwstatsEntries, &acc) | ||
|
||
acc.AssertContainsTaggedFields(t, "infiniband_hw", fields, tags) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Gets hardware counters from all InfiniBand cards and ports installed | ||
# This plugin ONLY supports Linux | ||
[[inputs.infiniband_hw]] | ||
# no configuration |