feat(node | das | libs/header/sync): add total uptime node metrics + totalSampled das metrics + totalSynced sync metrics (#1638)

## Overview

This PR introduces node uptime metrics and DAS total-sampled-headers
metrics to support calculating the uptime index proposed by Mustafa on
the monitoring side.

It also introduces a new module named `Telemetry` to host node-related
telemetry. This module can additionally host any general telemetry and
observability concerns that do not belong to a specific module.

## Changes

- [x] Introduced uptime metrics for the node under
`nodebuilder/node/uptime.go`
- [x] Introduced persistent uptime metrics, using the datastore to persist
the node start time (see the sketch after this list)
- [x] Testing for uptime metrics persistence using the store
- [x] Unit testing for uptime metrics
- [x] Integration testing for uptime metrics
- [ ] e2e testing for uptime metrics
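
For context, below is a minimal sketch of how persisting the node start time in the datastore could look. This is not the code from `nodebuilder/node/uptime.go` (that file is not shown in this diff); the key name and helper are illustrative assumptions.

```go
package node

import (
	"context"
	"encoding/binary"
	"errors"
	"time"

	"github.com/ipfs/go-datastore"
)

// startTimeKey is an assumed key under which the node start timestamp is stored.
var startTimeKey = datastore.NewKey("node_start_time")

// loadOrPersistStartTime returns the previously persisted node start time,
// storing the current time on first run so uptime survives restarts.
func loadOrPersistStartTime(ctx context.Context, ds datastore.Datastore) (time.Time, error) {
	b, err := ds.Get(ctx, startTimeKey)
	if err == nil {
		return time.Unix(int64(binary.BigEndian.Uint64(b)), 0), nil
	}
	if !errors.Is(err, datastore.ErrNotFound) {
		return time.Time{}, err
	}

	now := time.Now()
	buf := make([]byte, 8)
	binary.BigEndian.PutUint64(buf, uint64(now.Unix()))
	if err := ds.Put(ctx, startTimeKey, buf); err != nil {
		return time.Time{}, err
	}
	return now, nil
}
```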

## Checklist


- [x] New and updated code has appropriate documentation
- [x] New and updated code has new and/or updated testing
- [x] Required CI checks are passing
- [x] Visual proof for any user-facing features like CLI or
documentation updates
- [ ] Linked issues closed with keywords

## Blocked By

PR: #1537

---------

Co-authored-by: rene <41963722+renaynay@users.noreply.github.com>
derrandz and renaynay authored Feb 8, 2023
1 parent abb9fa1 commit 6933f03
Showing 14 changed files with 306 additions and 28 deletions.
9 changes: 9 additions & 0 deletions das/checkpoint.go
@@ -48,3 +48,12 @@ func (c checkpoint) String() string {

return str
}

// totalSampled returns the total amount of sampled headers
func (c checkpoint) totalSampled() uint64 {
var totalInProgress uint64
for _, w := range c.Workers {
totalInProgress += (w.To - w.From) + 1
}
return c.SampleFrom - totalInProgress - uint64(len(c.Failed))
}
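
For intuition, applying the new helper to assumed numbers: if sampling resumed at `SampleFrom = 1000` with a single worker still covering heights `990`–`999` (10 headers in progress) and 4 heights recorded in `Failed`, `totalSampled()` returns `1000 - 10 - 4 = 986`.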
4 changes: 4 additions & 0 deletions das/coordinator.go
@@ -55,6 +55,10 @@ func newSamplingCoordinator(

func (sc *samplingCoordinator) run(ctx context.Context, cp checkpoint) {
sc.state.resumeFromCheckpoint(cp)

// the amount of sampled headers from the last checkpoint
sc.metrics.recordTotalSampled(cp.totalSampled())

// resume workers
for _, wk := range cp.Workers {
sc.runWorker(ctx, sc.state.newJob(wk.From, wk.To))
60 changes: 52 additions & 8 deletions das/metrics.go
@@ -24,7 +24,9 @@ type metrics struct {
sampleTime syncfloat64.Histogram
getHeaderTime syncfloat64.Histogram
newHead syncint64.Counter
lastSampledTS int64

lastSampledTS uint64
totalSampledInt uint64
}

func (d *DASer) InitMetrics() error {
@@ -76,6 +78,16 @@ func (d *DASer) InitMetrics() error {
return err
}

totalSampled, err := meter.
AsyncInt64().
Gauge(
"das_total_sampled_headers",
instrument.WithDescription("total sampled headers gauge"),
)
if err != nil {
return err
}

d.sampler.metrics = &metrics{
sampled: sampled,
sampleTime: sampleTime,
@@ -85,7 +97,11 @@ func (d *DASer) InitMetrics() error {

err = meter.RegisterCallback(
[]instrument.Asynchronous{
lastSampledTS, busyWorkers, networkHead, sampledChainHead,
lastSampledTS,
busyWorkers,
networkHead,
sampledChainHead,
totalSampled,
},
func(ctx context.Context) {
stats, err := d.sampler.stats(ctx)
@@ -97,9 +113,12 @@ func (d *DASer) InitMetrics() error {
networkHead.Observe(ctx, int64(stats.NetworkHead))
sampledChainHead.Observe(ctx, int64(stats.SampledChainHead))

if ts := atomic.LoadInt64(&d.sampler.metrics.lastSampledTS); ts != 0 {
lastSampledTS.Observe(ctx, ts)
if ts := atomic.LoadUint64(&d.sampler.metrics.lastSampledTS); ts != 0 {
lastSampledTS.Observe(ctx, int64(ts))
}

totalSampledInt := atomic.LoadUint64(&d.sampler.metrics.totalSampledInt)
totalSampled.Observe(ctx, int64(totalSampledInt))
},
)

@@ -110,29 +129,54 @@ func (d *DASer) InitMetrics() error {
return nil
}

func (m *metrics) observeSample(ctx context.Context, h *header.ExtendedHeader, sampleTime time.Duration, err error) {
// observeSample records the time it took to sample a header +
// the amount of sampled contiguous headers
func (m *metrics) observeSample(
ctx context.Context,
h *header.ExtendedHeader,
sampleTime time.Duration,
err error,
) {
if m == nil {
return
}
m.sampleTime.Record(ctx, sampleTime.Seconds(),
attribute.Bool("failed", err != nil),
attribute.Int("header_width", len(h.DAH.RowsRoots)))
attribute.Int("header_width", len(h.DAH.RowsRoots)),
)

m.sampled.Add(ctx, 1,
attribute.Bool("failed", err != nil),
attribute.Int("header_width", len(h.DAH.RowsRoots)))
atomic.StoreInt64(&m.lastSampledTS, time.Now().UTC().Unix())
attribute.Int("header_width", len(h.DAH.RowsRoots)),
)

atomic.StoreUint64(&m.lastSampledTS, uint64(time.Now().UTC().Unix()))

if err == nil {
atomic.AddUint64(&m.totalSampledInt, 1)
}
}

// observeGetHeader records the time it took to get a header from the header store.
func (m *metrics) observeGetHeader(ctx context.Context, d time.Duration) {
if m == nil {
return
}
m.getHeaderTime.Record(ctx, d.Seconds())
}

// observeNewHead records the network head.
func (m *metrics) observeNewHead(ctx context.Context) {
if m == nil {
return
}
m.newHead.Add(ctx, 1)
}

// recordTotalSampled records the total sampled headers.
func (m *metrics) recordTotalSampled(totalSampled uint64) {
if m == nil {
return
}
atomic.StoreUint64(&m.totalSampledInt, totalSampled)
}
3 changes: 2 additions & 1 deletion das/options.go
@@ -38,7 +38,8 @@ type Parameters struct {
// SampleFrom is the height sampling will start from if no previous checkpoint was saved
SampleFrom uint64

// SampleTimeout is a maximum amount time sampling of single block may take until it will be canceled
// SampleTimeout is a maximum amount time sampling of single block may take until it will be
// canceled
SampleTimeout time.Duration
}

40 changes: 32 additions & 8 deletions das/worker.go
@@ -60,8 +60,15 @@ func (w *worker) run(
}

metrics.observeGetHeader(ctx, time.Since(startGet))
log.Debugw("got header from header store", "height", h.Height(), "hash", h.Hash(),
"square width", len(h.DAH.RowsRoots), "data root", h.DAH.Hash(), "finished (s)", time.Since(startGet))

log.Debugw(
"got header from header store",
"height", h.Height(),
"hash", h.Hash(),
"square width", len(h.DAH.RowsRoots),
"data root", h.DAH.Hash(),
"finished (s)", time.Since(startGet),
)

startSample := time.Now()
err = sample(ctx, h)
@@ -72,18 +79,35 @@ func (w *worker) run(
w.setResult(curr, err)
metrics.observeSample(ctx, h, time.Since(startSample), err)
if err != nil {
log.Debugw("failed to sampled header", "height", h.Height(), "hash", h.Hash(),
"square width", len(h.DAH.RowsRoots), "data root", h.DAH.Hash(), "err", err)
log.Debugw(
"failed to sampled header",
"height", h.Height(),
"hash", h.Hash(),
"square width", len(h.DAH.RowsRoots),
"data root", h.DAH.Hash(),
"err", err,
)
} else {
log.Debugw("sampled header", "height", h.Height(), "hash", h.Hash(),
"square width", len(h.DAH.RowsRoots), "data root", h.DAH.Hash(), "finished (s)", time.Since(startSample))
log.Debugw(
"sampled header",
"height", h.Height(),
"hash", h.Hash(),
"square width", len(h.DAH.RowsRoots),
"data root", h.DAH.Hash(),
"finished (s)", time.Since(startSample),
)
}
}

if w.state.Curr > w.state.From {
jobTime := time.Since(jobStart)
log.Infow("sampled headers", "from", w.state.From, "to", w.state.Curr,
"finished (s)", jobTime.Seconds())
log.Infow(
"sampled headers",
"from", w.state.From,
"to", w.state.Curr,
"finished (s)",
jobTime.Seconds(),
)
}

select {
4 changes: 2 additions & 2 deletions go.mod
@@ -62,12 +62,14 @@ require (
go.opentelemetry.io/otel/sdk v1.11.2
go.opentelemetry.io/otel/sdk/metric v0.34.0
go.opentelemetry.io/otel/trace v1.11.2
go.opentelemetry.io/proto/otlp v0.19.0
go.uber.org/fx v1.18.2
go.uber.org/multierr v1.9.0
golang.org/x/crypto v0.5.0
golang.org/x/sync v0.1.0
golang.org/x/text v0.6.0
google.golang.org/grpc v1.52.0
google.golang.org/protobuf v1.28.2-0.20220831092852-f930b1dc76e8
)

require (
@@ -296,7 +298,6 @@ require (
go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.11.2 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.34.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.11.2 // indirect
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/dig v1.15.0 // indirect
go.uber.org/zap v1.24.0 // indirect
@@ -311,7 +312,6 @@ require (
google.golang.org/api v0.102.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20221118155620-16455021b5e6 // indirect
google.golang.org/protobuf v1.28.2-0.20220831092852-f930b1dc76e8 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
gopkg.in/natefinch/npipe.v2 v2.0.0-20160621034901-c1b8fa8bdcce // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
9 changes: 6 additions & 3 deletions header/metrics.go
@@ -9,12 +9,13 @@ import (
"go.opentelemetry.io/otel/metric/unit"

libhead "github.com/celestiaorg/celestia-node/libs/header"
"github.com/celestiaorg/celestia-node/libs/header/sync"
)

var meter = global.MeterProvider().Meter("header")

// WithMetrics enables Otel metrics to monitor head.
func WithMetrics(store libhead.Store[*ExtendedHeader]) {
// WithMetrics enables Otel metrics to monitor head and total amount of synced headers.
func WithMetrics(store libhead.Store[*ExtendedHeader], syncer *sync.Syncer[*ExtendedHeader]) error {
headC, _ := meter.AsyncInt64().Counter(
"head",
instrument.WithUnit(unit.Dimensionless),
@@ -40,6 +41,8 @@ func WithMetrics(store libhead.Store[*ExtendedHeader]) {
},
)
if err != nil {
panic(err)
return err
}

return syncer.InitMetrics()
}
43 changes: 43 additions & 0 deletions libs/header/sync/metrics.go
@@ -0,0 +1,43 @@
package sync

import (
"context"
"sync/atomic"

"go.opentelemetry.io/otel/metric/global"
"go.opentelemetry.io/otel/metric/instrument"
)

var meter = global.MeterProvider().Meter("header/sync")

type metrics struct {
totalSynced int64
}

func (s *Syncer[H]) InitMetrics() error {
s.metrics = &metrics{}

totalSynced, err := meter.
AsyncFloat64().
Gauge(
"total_synced_headers",
instrument.WithDescription("total synced headers"),
)
if err != nil {
return err
}

return meter.RegisterCallback(
[]instrument.Asynchronous{
totalSynced,
},
func(ctx context.Context) {
totalSynced.Observe(ctx, float64(atomic.LoadInt64(&s.metrics.totalSynced)))
},
)
}

// recordTotalSynced records the total amount of synced headers.
func (m *metrics) recordTotalSynced(totalSynced int) {
atomic.AddInt64(&m.totalSynced, int64(totalSynced))
}
5 changes: 5 additions & 0 deletions libs/header/sync/sync.go
@@ -53,6 +53,8 @@ type Syncer[H header.Header] struct {
cancel context.CancelFunc

Params *Parameters

metrics *metrics
}

// NewSyncer creates a new instance of Syncer.
@@ -233,6 +235,9 @@ func (s *Syncer[H]) doSync(ctx context.Context, fromHead, toHead H) (err error)
if err != nil && processed == 0 {
break
}
if s.metrics != nil {
s.metrics.recordTotalSynced(processed)
}
}

s.stateLk.Lock()
53 changes: 53 additions & 0 deletions nodebuilder/node/metrics.go
@@ -0,0 +1,53 @@
package node

import (
"context"
"time"

"go.opentelemetry.io/otel/metric/global"
"go.opentelemetry.io/otel/metric/instrument"
)

var meter = global.MeterProvider().Meter("node")

var (
timeStarted time.Time
nodeStarted bool
)

// WithMetrics registers node metrics.
func WithMetrics() error {
nodeStartTS, err := meter.
AsyncFloat64().
Gauge(
"node_start_ts",
instrument.WithDescription("timestamp when the node was started"),
)
if err != nil {
return err
}

totalNodeRunTime, err := meter.
AsyncFloat64().
Counter(
"node_runtime_counter_in_seconds",
instrument.WithDescription("total time the node has been running"),
)
if err != nil {
return err
}

return meter.RegisterCallback(
[]instrument.Asynchronous{nodeStartTS, totalNodeRunTime},
func(ctx context.Context) {
if !nodeStarted {
// Observe node start timestamp
timeStarted = time.Now()
nodeStartTS.Observe(ctx, float64(timeStarted.Unix()))
nodeStarted = true
}

totalNodeRunTime.Observe(ctx, time.Since(timeStarted).Seconds())
},
)
}
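
As a usage note, here is a minimal sketch (assumed wiring, not taken from this diff) of how the new `Telemetry` module could invoke `node.WithMetrics` when metrics are enabled:

```go
package telemetry // hypothetical package name for the Telemetry module

import (
	"go.uber.org/fx"

	"github.com/celestiaorg/celestia-node/nodebuilder/node"
)

// ConstructModule sketches wiring the node metrics behind a metrics flag;
// the actual module layout in the PR may differ.
func ConstructModule(metricsEnabled bool) fx.Option {
	if !metricsEnabled {
		return fx.Options()
	}
	// fx.Invoke runs node.WithMetrics at startup; a non-nil error aborts app start.
	return fx.Options(
		fx.Invoke(node.WithMetrics),
	)
}
```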