@@ -15,7 +15,6 @@ package maintainer
15
15
16
16
import (
17
17
"encoding/json"
18
- "fmt"
19
18
"math"
20
19
"sync"
21
20
"time"
@@ -100,6 +99,7 @@ type Maintainer struct {
100
99
scheduledTaskGauge prometheus.Gauge
101
100
runningTaskGauge prometheus.Gauge
102
101
tableCountGauge prometheus.Gauge
102
+ handleEventDuration prometheus.Observer
103
103
}
104
104
105
105
// NewMaintainer creates the maintainer for the changefeed.
@@ -143,6 +143,7 @@ func NewMaintainer(cfID model.ChangeFeedID,
143
143
scheduledTaskGauge : metrics .ScheduleTaskGuage .WithLabelValues (cfID .Namespace , cfID .ID ),
144
144
runningTaskGauge : metrics .RunningScheduleTaskGauge .WithLabelValues (cfID .Namespace , cfID .ID ),
145
145
tableCountGauge : metrics .TableGauge .WithLabelValues (cfID .Namespace , cfID .ID ),
146
+ handleEventDuration : metrics .MaintainerHandleEventDuration .WithLabelValues (cfID .Namespace , cfID .ID ),
146
147
}
147
148
m .bootstrapper = NewBootstrapper (m .id .ID , m .getNewBootstrapFn ())
148
149
m .barrier = NewBarrier (m .scheduler )
@@ -159,12 +160,13 @@ func (m *Maintainer) HandleEvent(event *Event) bool {
159
160
start := time .Now ()
160
161
defer func () {
161
162
duration := time .Since (start )
162
- if duration > time .Millisecond * 500 {
163
+ if duration > time .Second {
163
164
log .Info ("maintainer is too slow" ,
164
165
zap .String ("id" , m .id .String ()),
165
166
zap .Int ("type" , event .eventType ),
166
167
zap .Duration ("duration" , duration ))
167
168
}
169
+ m .handleEventDuration .Observe (duration .Seconds ())
168
170
}()
169
171
if m .state == heartbeatpb .ComponentState_Stopped {
170
172
log .Warn ("maintainer is not stopped, ignore" ,
@@ -274,6 +276,7 @@ func (m *Maintainer) cleanupMetrics() {
274
276
metrics .ScheduleTaskGuage .DeleteLabelValues (m .id .Namespace , m .id .ID )
275
277
metrics .RunningScheduleTaskGauge .DeleteLabelValues (m .id .Namespace , m .id .ID )
276
278
metrics .TableGauge .DeleteLabelValues (m .id .Namespace , m .id .ID )
279
+ metrics .MaintainerHandleEventDuration .DeleteLabelValues (m .id .Namespace , m .id .ID )
277
280
}
278
281
279
282
func (m * Maintainer ) onInit () bool {
@@ -407,7 +410,7 @@ func (m *Maintainer) updateMetrics() {
407
410
m .changefeedStatusGauge .Set (float64 (m .state ))
408
411
}
409
412
410
- // send message to remote, todo: use a io thread pool
413
+ // sendMessages sends each of the given messages to its remote target.
411
414
func (m * Maintainer ) sendMessages (msgs []* messaging.TargetMessage ) {
412
415
for _ , msg := range msgs {
413
416
err := m .mc .SendCommand (msg )
@@ -628,15 +631,15 @@ func (m *Maintainer) onPeriodTask() {
628
631
}
629
632
// send scheduling messages
630
633
m .handleResendMessage ()
631
- m .printStatus ()
634
+ m .collectMetrics ()
632
635
m .calCheckpointTs ()
633
636
SubmitScheduledEvent (m .taskScheduler , m .stream , & Event {
634
637
changefeedID : m .id .ID ,
635
638
eventType : EventPeriod ,
636
639
}, time .Now ().Add (time .Millisecond * 500 ))
637
640
}
638
641
639
- func (m * Maintainer ) printStatus () {
642
+ func (m * Maintainer ) collectMetrics () {
640
643
if time .Since (m .lastPrintStatusTime ) > time .Second * 20 {
641
644
tableStates := make (map [scheduler.SchedulerStatus ]int )
642
645
total := m .scheduler .TaskSize ()
@@ -650,20 +653,6 @@ func (m *Maintainer) printStatus() {
650
653
for state , count := range tableStates {
651
654
metrics .TableStateGauge .WithLabelValues (m .id .Namespace , m .id .ID , state .String ()).Set (float64 (count ))
652
655
}
653
-
654
- var taskDistribution string
655
- for nodeID , _ := range m .bootstrapper .GetAllNodes () {
656
- taskDistribution = fmt .Sprintf ("%s, %s=%d" ,
657
- taskDistribution , nodeID , m .scheduler .GetTaskSizeByNodeID (nodeID ))
658
- }
659
- log .Info ("table span status" ,
660
- zap .String ("distribution" , taskDistribution ),
661
- zap .String ("changefeed" , m .id .ID ),
662
- zap .Int ("total" , total ),
663
- zap .Int ("absent" , tableStates [scheduler .SchedulerStatusAbsent ]),
664
- zap .Int ("commiting" , tableStates [scheduler .SchedulerStatusCommiting ]),
665
- zap .Int ("working" , tableStates [scheduler .SchedulerStatusWorking ]),
666
- zap .Int ("removing" , tableStates [scheduler .SchedulerStatusRemoving ]))
667
656
m .lastPrintStatusTime = time .Now ()
668
657
}
669
658
}
0 commit comments