Skip to content

Commit

Permalink
feat: refactor consumer, add runtime metrics (#302)
Browse files Browse the repository at this point in the history
  • Loading branch information
crimson-gao authored Nov 28, 2024
1 parent 22f712a commit 7e1c08e
Show file tree
Hide file tree
Showing 7 changed files with 367 additions and 226 deletions.
2 changes: 2 additions & 0 deletions consumer/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ type LogHubConfig struct {
//:param AutoCommitIntervalInSec: default auto commit interval, default is 30
//:param AuthVersion: signature algorithm version, default is sls.AuthV1
//:param Region: region of sls endpoint, eg. cn-hangzhou, region must be set if AuthVersion is sls.AuthV4
//:param DisableRuntimeMetrics: set to true to disable runtime metrics; when not disabled, runtime metrics are printed to the local log.
Endpoint string
AccessKeyID string
AccessKeySecret string
Expand Down Expand Up @@ -81,6 +82,7 @@ type LogHubConfig struct {
AutoCommitIntervalInMS int64
AuthVersion sls.AuthVersionType
Region string
DisableRuntimeMetrics bool
}

const (
Expand Down
116 changes: 116 additions & 0 deletions consumer/shard_monitor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package consumerLibrary

import (
"fmt"
"math"
"time"

"go.uber.org/atomic"

sls "github.com/aliyun/aliyun-log-go-sdk"
"github.com/go-kit/kit/log"
)

// MonitorMetrics holds the counters and latency statistics that a ShardMonitor
// accumulates between two reports.
type MonitorMetrics struct {
// fetchReqFailedCount counts fetch requests recorded with a non-nil error.
fetchReqFailedCount atomic.Int64
// logRawSize sums PullLogMeta.RawSize over fetch requests recorded without error.
logRawSize atomic.Int64
fetchLogHistogram TimeHistogram // fetch request latency, in microseconds

// processFailedCount counts process calls recorded with a non-nil error.
processFailedCount atomic.Int64
processHistogram TimeHistogram // process latency, in microseconds
}

// ShardMonitor collects runtime metrics for one shard and writes them to a
// logger when asked to report.
type ShardMonitor struct {
shard int // shard number passed to newShardMonitor
reportInterval time.Duration // minimum elapsed time before shouldReport returns true
lastReportTime time.Time // set at construction and at the start of every reportByLogger call
metrics atomic.Value // *MonitorMetrics
}

// newShardMonitor builds a ShardMonitor for the given shard. The monitor
// starts with an empty MonitorMetrics and uses the moment of construction as
// its last report time.
func newShardMonitor(shard int, reportInterval time.Duration) *ShardMonitor {
	m := new(ShardMonitor)
	m.shard = shard
	m.reportInterval = reportInterval
	m.lastReportTime = time.Now()
	m.metrics.Store(new(MonitorMetrics))
	return m
}

// RecordFetchRequest records the outcome of one fetch request in the current
// metrics.
//
// A non-nil err increments the failed fetch counter; otherwise plm.RawSize is
// added to the raw size total. In both cases the time elapsed since start is
// added to the fetch histogram, in microseconds.
//
// plm may be nil: a nil plm together with a nil err contributes no raw size
// instead of panicking on a nil pointer dereference.
func (m *ShardMonitor) RecordFetchRequest(plm *sls.PullLogMeta, err error, start time.Time) {
	metrics := m.metrics.Load().(*MonitorMetrics)
	switch {
	case err != nil:
		metrics.fetchReqFailedCount.Inc()
	case plm != nil:
		// Guarded so that recording metrics can never crash the caller.
		metrics.logRawSize.Add(int64(plm.RawSize))
	}
	metrics.fetchLogHistogram.AddSample(float64(time.Since(start).Microseconds()))
}

// RecordProcess records the outcome of one process call in the current
// metrics: a non-nil err increments the failed process counter, and the time
// elapsed since start is always added to the process histogram, in
// microseconds.
func (m *ShardMonitor) RecordProcess(err error, start time.Time) {
	current := m.metrics.Load().(*MonitorMetrics)
	if err != nil {
		current.processFailedCount.Inc()
	}
	elapsed := time.Since(start)
	current.processHistogram.AddSample(float64(elapsed.Microseconds()))
}

// getAndResetMetrics returns the metrics accumulated so far and installs a
// fresh, empty MonitorMetrics in their place.
//
// The load and the store are two separate steps rather than one atomic
// exchange; this relies on only one goroutine ever calling m.metrics.Store.
func (m *ShardMonitor) getAndResetMetrics() *MonitorMetrics {
	previous, fresh := m.metrics.Load().(*MonitorMetrics), new(MonitorMetrics)
	m.metrics.Store(fresh)
	return previous
}

// shouldReport reports whether at least reportInterval has elapsed since the
// last report time.
func (m *ShardMonitor) shouldReport() bool {
	sinceLast := time.Since(m.lastReportTime)
	return sinceLast >= m.reportInterval
}

// reportByLogger marks now as the last report time, takes and resets the
// accumulated metrics, and writes them to logger as a single "report status"
// entry.
func (m *ShardMonitor) reportByLogger(logger log.Logger) {
	m.lastReportTime = time.Now()
	snapshot := m.getAndResetMetrics()

	keyvals := []interface{}{
		"msg", "report status",
		"fetchFailed", snapshot.fetchReqFailedCount.Load(),
		"logRawSize", snapshot.logRawSize.Load(),
		"processFailed", snapshot.processFailedCount.Load(),
		"fetch", snapshot.fetchLogHistogram.String(),
		"process", snapshot.processHistogram.String(),
	}
	logger.Log(keyvals...)
}

// TimeHistogram accumulates the count, the sum and the sum of squares of the
// samples added to it; Avg and StdDev are derived from these three values.
type TimeHistogram struct {
Count atomic.Int64 // number of samples added
Sum atomic.Float64 // sum of all sample values
SumSquare atomic.Float64 // sum of the squares of all sample values
}

// AddSample adds one sample v: it increments the count, adds v to the sum and
// adds v*v to the sum of squares. Each of the three updates is atomic on its
// own.
func (h *TimeHistogram) AddSample(v float64) {
	squared := v * v
	h.Count.Inc()
	h.Sum.Add(v)
	h.SumSquare.Add(squared)
}

// String formats the average, the standard deviation and the sample count as
// "{avg: <a>us, stdDev: <s>us, count: <n>}", with one decimal place for the
// two floating point values.
func (h *TimeHistogram) String() string {
	mean, deviation := h.Avg(), h.StdDev()
	return fmt.Sprintf("{avg: %.1fus, stdDev: %.1fus, count: %d}", mean, deviation, h.Count.Load())
}

// Avg returns the arithmetic mean of the added samples, or 0 when no sample
// has been added.
func (h *TimeHistogram) Avg() float64 {
	if n := h.Count.Load(); n != 0 {
		return h.Sum.Load() / float64(n)
	}
	return 0
}

// StdDev returns the sample standard deviation of the added samples,
//
//	sqrt((n*sumSquare - sum*sum) / (n*(n-1)))
//
// or 0 when fewer than two samples have been added.
//
// The numerator can come out slightly negative, either through floating point
// cancellation when the samples are nearly identical or because Count, Sum
// and SumSquare are loaded at different moments while samples are still being
// added. A negative variance is clamped to 0 so the result is never NaN for
// that reason.
func (h *TimeHistogram) StdDev() float64 {
	count := h.Count.Load()
	if count < 2 {
		return 0
	}
	// Work in float64 from the start so n*(n-1) cannot overflow int64.
	n := float64(count)
	sumSquare := h.SumSquare.Load()
	sum := h.Sum.Load()
	variance := (n*sumSquare - sum*sum) / (n * (n - 1))
	if variance < 0 {
		return 0
	}
	return math.Sqrt(variance)
}
33 changes: 33 additions & 0 deletions consumer/shard_monitor_benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package consumerLibrary

import (
"testing"
"time"

sls "github.com/aliyun/aliyun-log-go-sdk"
)

// BenchmarkRecordFetchRequest
// BenchmarkRecordFetchRequest-12 29816072 40.05 ns/op 0 B/op 0 allocs/op
func BenchmarkRecordFetchRequest(b *testing.B) {
shardMonitor := newShardMonitor(1, time.Second)
start := time.Now()
plm := &sls.PullLogMeta{RawSize: 1}
b.ResetTimer()

for i := 0; i < b.N; i++ {
shardMonitor.RecordFetchRequest(plm, nil, start)
}
}

// BenchmarkRecordProcess measures the cost of recording one successful
// process call on a ShardMonitor.
//
// Recorded result:
// BenchmarkRecordProcess-12 33092797 35.15 ns/op 0 B/op 0 allocs/op
func BenchmarkRecordProcess(b *testing.B) {
	monitor := newShardMonitor(1, time.Second)
	begin := time.Now()

	// Keep the setup above out of the measured time.
	b.ResetTimer()
	for iter := 0; iter < b.N; iter++ {
		monitor.RecordProcess(nil, begin)
	}
}
Loading

0 comments on commit 7e1c08e

Please sign in to comment.