Skip to content

Commit

Permalink
refactor(metrics): use default prometheus go metrics collector, and i…
Browse files Browse the repository at this point in the history
…mplement standard Prometheus gauges for existing metrics (#2219)

* add prometheus endpoint

* wip metrics

* more wip

* grandpa metric, NewIntervalConfig constructor

* remove dot/metrics

* use metrics.IntervalConfig in dot/network

* cleanup, include lib/metrics

* fix lint and license

* give more time to stress tests

* refactor most metrics to update from source

* use internal/httpserver for metrics server

* cr feedback

* Update internal/metrics/metrics.go

Co-authored-by: Eclésio Junior <eclesiomelo.1@gmail.com>

* fix lint

* Update dot/node.go

Co-authored-by: Quentin McGaw <quentin.mcgaw@gmail.com>

* Update dot/state/block.go

Co-authored-by: Quentin McGaw <quentin.mcgaw@gmail.com>

* Update dot/node.go

Co-authored-by: Quentin McGaw <quentin.mcgaw@gmail.com>

* Update internal/metrics/metrics.go

Co-authored-by: Quentin McGaw <quentin.mcgaw@gmail.com>

* cr feedback

* forgot to add tests file

* add license

Co-authored-by: Eclésio Junior <eclesiomelo.1@gmail.com>
Co-authored-by: Quentin McGaw <quentin.mcgaw@gmail.com>
  • Loading branch information
3 people authored Jan 28, 2022
1 parent f8d8657 commit e816e31
Show file tree
Hide file tree
Showing 20 changed files with 380 additions and 386 deletions.
130 changes: 0 additions & 130 deletions dot/metrics/collector.go

This file was deleted.

39 changes: 0 additions & 39 deletions dot/metrics/metrics.go

This file was deleted.

5 changes: 2 additions & 3 deletions dot/network/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

"github.com/ChainSafe/gossamer/dot/telemetry"
"github.com/ChainSafe/gossamer/internal/log"
"github.com/ChainSafe/gossamer/internal/metrics"
)

const (
Expand Down Expand Up @@ -93,9 +94,6 @@ type Config struct {
// privateKey the private key for the network p2p identity
privateKey crypto.PrivKey

// PublishMetrics enables collection of network metrics
PublishMetrics bool

// telemetryInterval how often to send telemetry metrics
telemetryInterval time.Duration

Expand All @@ -107,6 +105,7 @@ type Config struct {
SlotDuration time.Duration

Telemetry telemetry.Client
Metrics metrics.IntervalConfig
}

// build checks the configuration, sets up the private key for the network service,
Expand Down
134 changes: 76 additions & 58 deletions dot/network/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@ import (
"sync"
"time"

"github.com/ethereum/go-ethereum/metrics"
libp2pnetwork "github.com/libp2p/go-libp2p-core/network"
"github.com/libp2p/go-libp2p-core/peer"
"github.com/libp2p/go-libp2p-core/protocol"

gssmrmetrics "github.com/ChainSafe/gossamer/dot/metrics"
"github.com/ChainSafe/gossamer/dot/peerset"
"github.com/ChainSafe/gossamer/dot/telemetry"
"github.com/ChainSafe/gossamer/internal/log"
"github.com/ChainSafe/gossamer/internal/metrics"
"github.com/ChainSafe/gossamer/lib/common"
"github.com/ChainSafe/gossamer/lib/services"
libp2pnetwork "github.com/libp2p/go-libp2p-core/network"
"github.com/libp2p/go-libp2p-core/peer"
"github.com/libp2p/go-libp2p-core/protocol"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)

const (
Expand All @@ -35,14 +35,58 @@ const (
transactionsID = "/transactions/1"

maxMessageSize = 1024 * 63 // 63kb for now

gssmrIsMajorSyncMetric = "gossamer/network/is_major_syncing"
)

var (
_ services.Service = &Service{}
logger = log.NewFromGlobal(log.AddContext("pkg", "network"))
maxReads = 256

peerCountGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_node",
Name: "peer_count_total",
Help: "total peer count",
})
connectionsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_node",
Name: "connections_total",
Help: "total number of connections",
})
nodeLatencyGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_node",
Name: "latency_ms",
Help: "average node latency in milliseconds",
})
inboundBlockAnnounceStreamsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_streams_block_announce",
Name: "inbound_total",
Help: "total number of inbound block announce streams",
})
outboundBlockAnnounceStreamsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_streams_block_announce",
Name: "outbound_total",
Help: "total number of outbound block announce streams",
})
inboundGrandpaStreamsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_streams_grandpa",
Name: "inbound_total",
Help: "total number of inbound grandpa streams",
})
outboundGrandpaStreamsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_streams_grandpa",
Name: "outbound_total",
Help: "total number of outbound grandpa streams",
})
inboundStreamsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_streams",
Name: "inbound_total",
Help: "total number of inbound streams",
})
outboundStreamsGauge = promauto.NewGauge(prometheus.GaugeOpts{
Namespace: "gossamer_network_streams",
Name: "outbound_total",
Help: "total number of outbound streams",
})
)

type (
Expand Down Expand Up @@ -83,6 +127,8 @@ type Service struct {
noMDNS bool
noGossip bool // internal option

Metrics metrics.IntervalConfig

// telemetry
telemetryInterval time.Duration
closeCh chan struct{}
Expand Down Expand Up @@ -166,6 +212,7 @@ func NewService(cfg *Config) (*Service, error) {
streamManager: newStreamManager(ctx),
blockResponseBuf: make([]byte, maxBlockResponseSize),
telemetry: cfg.Telemetry,
Metrics: cfg.Metrics,
}

return network, err
Expand Down Expand Up @@ -277,8 +324,8 @@ func (s *Service) Start() error {

logger.Info("started network service with supported protocols " + strings.Join(s.host.protocols(), ", "))

if s.cfg.PublishMetrics {
go s.collectNetworkMetrics()
if s.Metrics.Publish {
go s.updateMetrics()
}

go s.logPeerCount()
Expand All @@ -289,44 +336,27 @@ func (s *Service) Start() error {
return nil
}

func (s *Service) collectNetworkMetrics() {
func (s *Service) updateMetrics() {
ticker := time.NewTicker(s.Metrics.Interval)
defer ticker.Stop()
for {
peerCount := metrics.GetOrRegisterGauge("network/node/peerCount", metrics.DefaultRegistry)
totalConn := metrics.GetOrRegisterGauge("network/node/totalConnection", metrics.DefaultRegistry)
networkLatency := metrics.GetOrRegisterGauge("network/node/latency", metrics.DefaultRegistry)
syncedBlocks := metrics.GetOrRegisterGauge(
"service/blocks/sync",
metrics.DefaultRegistry)
numInboundBlockAnnounceStreams := metrics.GetOrRegisterGauge(
"network/streams/block_announce/inbound",
metrics.DefaultRegistry)
numOutboundBlockAnnounceStreams := metrics.GetOrRegisterGauge(
"network/streams/block_announce/outbound",
metrics.DefaultRegistry)
numInboundGrandpaStreams := metrics.GetOrRegisterGauge("network/streams/grandpa/inbound", metrics.DefaultRegistry)
numOutboundGrandpaStreams := metrics.GetOrRegisterGauge("network/streams/grandpa/outbound", metrics.DefaultRegistry)
totalInboundStreams := metrics.GetOrRegisterGauge("network/streams/total/inbound", metrics.DefaultRegistry)
totalOutboundStreams := metrics.GetOrRegisterGauge("network/streams/total/outbound", metrics.DefaultRegistry)

peerCount.Update(int64(s.host.peerCount()))
totalConn.Update(int64(len(s.host.h.Network().Conns())))
networkLatency.Update(int64(s.host.h.Peerstore().LatencyEWMA(s.host.id())))

numInboundBlockAnnounceStreams.Update(s.getNumStreams(BlockAnnounceMsgType, true))
numOutboundBlockAnnounceStreams.Update(s.getNumStreams(BlockAnnounceMsgType, false))
numInboundGrandpaStreams.Update(s.getNumStreams(ConsensusMsgType, true))
numOutboundGrandpaStreams.Update(s.getNumStreams(ConsensusMsgType, false))
totalInboundStreams.Update(s.getTotalStreams(true))
totalOutboundStreams.Update(s.getTotalStreams(false))

num, err := s.blockState.BestBlockNumber()
if err != nil {
syncedBlocks.Update(0)
} else {
syncedBlocks.Update(num.Int64())
select {
case <-s.ctx.Done():
return
case <-ticker.C:
peerCountGauge.Set(float64(s.host.peerCount()))
connectionsGauge.Set(float64(len(s.host.h.Network().Conns())))
nodeLatencyGauge.Set(float64(
s.host.h.Peerstore().LatencyEWMA(s.host.id()).Milliseconds()))
inboundBlockAnnounceStreamsGauge.Set(float64(
s.getNumStreams(BlockAnnounceMsgType, true)))
outboundBlockAnnounceStreamsGauge.Set(float64(
s.getNumStreams(BlockAnnounceMsgType, false)))
inboundGrandpaStreamsGauge.Set(float64(s.getNumStreams(ConsensusMsgType, true)))
outboundGrandpaStreamsGauge.Set(float64(s.getNumStreams(ConsensusMsgType, false)))
inboundStreamsGauge.Set(float64(s.getTotalStreams(true)))
outboundStreamsGauge.Set(float64(s.getTotalStreams(false)))
}

time.Sleep(gssmrmetrics.RefreshInterval)
}
}

Expand Down Expand Up @@ -642,18 +672,6 @@ func (s *Service) NodeRoles() byte {
return s.cfg.Roles
}

// CollectGauge will be used to collect countable metrics from network service
func (s *Service) CollectGauge() map[string]int64 {
var isSynced int64
if !s.syncer.IsSynced() {
isSynced = 1
}

return map[string]int64{
gssmrIsMajorSyncMetric: isSynced,
}
}

// HighestBlock returns the highest known block number
func (*Service) HighestBlock() int64 {
// TODO: refactor this to get the data from the sync service (#1857)
Expand Down
Loading

0 comments on commit e816e31

Please sign in to comment.