Skip to content

Commit

Permalink
feat: Improve ResourceManager UX (#9338)
Browse files Browse the repository at this point in the history
This PR adds several new features that make ResourceManager easier to use:

- Resource manager messages about exceeded limits are now logged at ERROR level instead of warning.
- The resources exceeded error now shows what kind of limit was reached and the scope.
- Once limits stop being exceeded, we print a message telling the user that limits are no longer being exceeded.
- Added `swarm limit all` command to show all set limits with the same format as `swarm stats all`
- Added `min-used-limit-perc` option to `swarm stats all` to only show stats that are above a specific percentage
- Greatly simplified the default values.
- **Enable ResourceManager by default.**

Output example:
```
2022-11-09T10:51:40.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:51:50.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 483095 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:51:50.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:52:00.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 455294 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:00.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:52:10.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 471384 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:10.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:52:20.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 8 times with error "peer:12D3KooWKqcaBtcmZKLKCCoDPBuA6AXGJMNrLQUPPMsA5Q6D1eG6: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:20.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 192 times with error "peer:12D3KooWPjetWPGQUih9LZTGHdyAM9fKaXtUxDyBhA93E3JAWCXj: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:20.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 469746 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:20.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:52:30.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 484137 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:30.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 29 times with error "peer:12D3KooWPjetWPGQUih9LZTGHdyAM9fKaXtUxDyBhA93E3JAWCXj: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:30.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:52:40.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 468843 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:40.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:52:50.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 366638 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:52:50.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:53:00.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 405526 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:53:00.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 107 times with error "peer:12D3KooWQZQCwevTDGhkE9iGYk5sBzWRDUSX68oyrcfM9tXyrs2Q: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:53:00.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:53:10.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 336923 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:53:10.566+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:53:20.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:55      Resource limits were exceeded 71 times with error "transient: cannot reserve inbound stream: resource limit exceeded".
2022-11-09T10:53:20.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:59      Consider inspecting logs and raising the resource manager limits. Documentation: https://github.com/ipfs/kubo/blob/master/docs/config.md#swarmresourcemgr
2022-11-09T10:53:30.565+0100    ERROR   resourcemanager libp2p/rcmgr_logging.go:64      Resrouce limits are no longer being exceeded.

```
## Validation tests

- Accelerated DHT client runs with no errors when ResourceManager is active. No problems were observed.
- Running an attack with 200 connections and 1M streams using yamux protocol. Node was usable during the attack. With ResourceManager deactivated, the node was killed by the OS because of the amount of memory consumed.
	- Actions done when the attack was active:
		- Add files 
		- Force a reprovide
		- Use the gateway to resolve an IPNS address.

It closes #9001 
It closes #9351
It closes #9322
  • Loading branch information
ajnavarro authored Nov 10, 2022
1 parent 72262a8 commit 254d81a
Show file tree
Hide file tree
Showing 17 changed files with 482 additions and 905 deletions.
1 change: 0 additions & 1 deletion .circleci/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ default_environment: &default_environment
CIRCLE_TEST_REPORTS: /tmp/circleci-test-results
CIRCLE_ARTIFACTS: /tmp/circleci-artifacts
GIT_PAGER: cat
IPFS_CHECK_RCMGR_DEFAULTS: 1

executors:
golang:
Expand Down
4 changes: 4 additions & 0 deletions config/swarm.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,10 @@ type ResourceMgr struct {
// Enables the Network Resource Manager feature, default to on.
Enabled Flag `json:",omitempty"`
Limits *rcmgr.LimitConfig `json:",omitempty"`

MaxMemory OptionalString `json:",omitempty"`
MaxFileDescriptors OptionalInteger `json:",omitempty"`

// A list of multiaddrs that can bypass normal system limits (but are still
// limited by the allowlist scope). Convenience config around
// https://pkg.go.dev/github.com/libp2p/go-libp2p/p2p/host/resource-manager#Allowlist.Add
Expand Down
33 changes: 20 additions & 13 deletions core/commands/swarm.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ ipfs peers in the internet.
}

const (
swarmVerboseOptionName = "verbose"
swarmStreamsOptionName = "streams"
swarmLatencyOptionName = "latency"
swarmDirectionOptionName = "direction"
swarmResetLimitsOptionName = "reset"
swarmVerboseOptionName = "verbose"
swarmStreamsOptionName = "streams"
swarmLatencyOptionName = "latency"
swarmDirectionOptionName = "direction"
swarmResetLimitsOptionName = "reset"
swarmUsedResourcesPercentageName = "min-used-limit-perc"
)

type peeringResult struct {
Expand Down Expand Up @@ -340,6 +341,9 @@ The output of this command is JSON.
Arguments: []cmds.Argument{
cmds.StringArg("scope", true, false, "scope of the stat report"),
},
Options: []cmds.Option{
cmds.IntOption(swarmUsedResourcesPercentageName, "Display only resources that are using above the specified percentage"),
},
Run: func(req *cmds.Request, res cmds.ResponseEmitter, env cmds.Environment) error {
node, err := cmdenv.GetNode(env)
if err != nil {
Expand All @@ -353,8 +357,10 @@ The output of this command is JSON.
if len(req.Arguments) != 1 {
return fmt.Errorf("must specify exactly one scope")
}

percentage, _ := req.Options[swarmUsedResourcesPercentageName].(int)
scope := req.Arguments[0]
result, err := libp2p.NetStat(node.ResourceManager, scope)
result, err := libp2p.NetStat(node.ResourceManager, scope, percentage)
if err != nil {
return err
}
Expand All @@ -378,6 +384,7 @@ var swarmLimitCmd = &cmds.Command{
Tagline: "Get or set resource limits for a scope.",
LongDescription: `Get or set resource limits for a scope.
The scope can be one of the following:
- all -- all limits actually being applied.
- system -- limits for the system aggregate resource usage.
- transient -- limits for the transient resource usage.
- svc:<service> -- limits for the resource usage of a specific service.
Expand Down Expand Up @@ -435,19 +442,19 @@ Changes made via command line are persisted in the Swarm.ResourceMgr.Limits fiel
}
}

var result rcmgr.BaseLimit
var result interface{}
_, reset := req.Options[swarmResetLimitsOptionName]
if reset {
result, err = libp2p.NetResetLimit(node.ResourceManager, node.Repo, scope)
if err != nil {
return err
}
} else if scope == "all" {
result, err = libp2p.NetLimitAll(node.ResourceManager)
} else {
// get scope limit
result, err = libp2p.NetLimit(node.ResourceManager, scope)
if err != nil {
return err
}
}

if err != nil {
return err
}

b := new(bytes.Buffer)
Expand Down
7 changes: 7 additions & 0 deletions core/node/libp2p/fd/sys_not_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
//go:build !linux && !darwin && !windows

package fd

// GetNumFDs is the fallback built for platforms other than linux, darwin and
// windows. It performs no system query and always reports 0.
// NOTE(review): 0 presumably tells callers the limit is unknown — confirm
// against the call sites.
func GetNumFDs() int {
	const numFDs = 0
	return numFDs
}
16 changes: 16 additions & 0 deletions core/node/libp2p/fd/sys_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
//go:build linux || darwin
// +build linux darwin

package fd

import (
"golang.org/x/sys/unix"
)

// GetNumFDs returns the current RLIMIT_NOFILE value of the process, i.e. the
// Cur field filled in by unix.Getrlimit.
//
// It returns 0 when the limit cannot be queried. A limit that does not fit in
// an int (for example an "unlimited" setting reported as a very large unsigned
// value) is clamped to the largest int instead of wrapping around to a
// negative or truncated number.
func GetNumFDs() int {
	// Largest value representable by int on this platform; computed locally
	// so that no additional import is required.
	const maxInt = int(^uint(0) >> 1)

	var l unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &l); err != nil {
		return 0
	}
	// l.Cur is an unsigned 64-bit value; a plain int conversion would
	// overflow for anything above maxInt.
	if uint64(l.Cur) > uint64(maxInt) {
		return maxInt
	}
	return int(l.Cur)
}
11 changes: 11 additions & 0 deletions core/node/libp2p/fd/sys_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
//go:build windows

package fd

import (
"math"
)

// GetNumFDs is the windows build of this helper. It performs no system query
// and always reports math.MaxInt.
// NOTE(review): presumably this means "no practical descriptor limit" on
// windows — confirm against the call sites.
func GetNumFDs() int {
	const numFDs = math.MaxInt
	return numFDs
}
Loading

0 comments on commit 254d81a

Please sign in to comment.