Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve logging and renaming PrimaryTermStartTimestamp in vttablets #13625

Merged
merged 3 commits into from
Aug 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go/test/endtoend/reparent/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ func CheckReparentFromOutside(t *testing.T, clusterInstance *cluster.LocalProces
streamHealthResponse := shrs[0]

assert.Equal(t, streamHealthResponse.Target.TabletType, topodatapb.TabletType_PRIMARY)
assert.True(t, streamHealthResponse.TabletExternallyReparentedTimestamp >= baseTime)
assert.True(t, streamHealthResponse.PrimaryTermStartTimestamp >= baseTime)
}

// WaitForReplicationPosition waits for tablet B to catch up to the replication position of tablet A.
Expand Down
28 changes: 14 additions & 14 deletions go/test/endtoend/tabletmanager/primary/tablet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,11 +155,11 @@ func TestRepeatedInitShardPrimary(t *testing.T) {
checkTabletType(t, replicaTablet.Alias, "REPLICA")
}

func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
func TestPrimaryRestartSetsPTSTimestamp(t *testing.T) {
defer cluster.PanicHandler(t)
// Test that TER timestamp is set when we restart the PRIMARY vttablet.
// TER = TabletExternallyReparented.
// See StreamHealthResponse.tablet_externally_reparented_timestamp for details.
// Test that PTS timestamp is set when we restart the PRIMARY vttablet.
// PTS = PrimaryTermStart.
// See StreamHealthResponse.primary_term_start_timestamp for details.

// Make replica as primary
err := clusterInstance.VtctlclientProcess.InitShardPrimary(keyspaceName, shardName, cell, replicaTablet.TabletUID)
Expand All @@ -168,7 +168,7 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
err = replicaTablet.VttabletProcess.WaitForTabletStatus("SERVING")
require.NoError(t, err)

// Capture the current TER.
// Capture the current PTS.
shrs, err := clusterInstance.StreamTabletHealth(context.Background(), &replicaTablet, 1)
require.NoError(t, err)

Expand All @@ -178,9 +178,9 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
got := fmt.Sprintf("%d", actualType)
want := fmt.Sprintf("%d", tabletType)
assert.Equal(t, want, got)
assert.NotNil(t, streamHealthRes1.GetTabletExternallyReparentedTimestamp())
assert.True(t, streamHealthRes1.GetTabletExternallyReparentedTimestamp() > 0,
"TER on PRIMARY must be set after InitShardPrimary")
assert.NotNil(t, streamHealthRes1.GetPrimaryTermStartTimestamp())
assert.True(t, streamHealthRes1.GetPrimaryTermStartTimestamp() > 0,
"PTS on PRIMARY must be set after InitShardPrimary")

// Restart the PRIMARY vttablet and test again

Expand All @@ -192,7 +192,7 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
err = clusterInstance.StartVttablet(&replicaTablet, "SERVING", false, cell, keyspaceName, hostname, shardName)
require.NoError(t, err)

// Make sure that the TER did not change
// Make sure that the PTS did not change
shrs, err = clusterInstance.StreamTabletHealth(context.Background(), &replicaTablet, 1)
require.NoError(t, err)

Expand All @@ -204,12 +204,12 @@ func TestPrimaryRestartSetsTERTimestamp(t *testing.T) {
want = fmt.Sprintf("%d", tabletType)
assert.Equal(t, want, got)

assert.NotNil(t, streamHealthRes2.GetTabletExternallyReparentedTimestamp())
assert.True(t, streamHealthRes2.GetTabletExternallyReparentedTimestamp() == streamHealthRes1.GetTabletExternallyReparentedTimestamp(),
assert.NotNil(t, streamHealthRes2.GetPrimaryTermStartTimestamp())
assert.True(t, streamHealthRes2.GetPrimaryTermStartTimestamp() == streamHealthRes1.GetPrimaryTermStartTimestamp(),
fmt.Sprintf("When the PRIMARY vttablet was restarted, "+
"the TER timestamp must be set by reading the old value from the tablet record. Old: %d, New: %d",
streamHealthRes1.GetTabletExternallyReparentedTimestamp(),
streamHealthRes2.GetTabletExternallyReparentedTimestamp()))
"the PTS timestamp must be set by reading the old value from the tablet record. Old: %d, New: %d",
streamHealthRes1.GetPrimaryTermStartTimestamp(),
streamHealthRes2.GetPrimaryTermStartTimestamp()))

// Reset primary
err = clusterInstance.VtctlclientProcess.InitShardPrimary(keyspaceName, shardName, cell, primaryTablet.TabletUID)
Expand Down
248 changes: 124 additions & 124 deletions go/vt/discovery/healthcheck_test.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ func (thc *tabletHealthCheck) processResponse(hc *HealthCheckImpl, shr *query.St
prevTarget.TabletType != topodata.TabletType_PRIMARY && prevTarget.TabletType == shr.Target.TabletType && thc.isTrivialReplagChange(shr.RealtimeStats)
thc.lastResponseTimestamp = time.Now()
thc.Target = shr.Target
thc.PrimaryTermStartTime = shr.TabletExternallyReparentedTimestamp
thc.PrimaryTermStartTime = shr.PrimaryTermStartTimestamp
thc.Stats = shr.RealtimeStats
thc.LastError = healthErr
reason := "healthCheck update"
Expand Down
248 changes: 123 additions & 125 deletions go/vt/proto/query/query.pb.go

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions go/vt/proto/query/query_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions go/vt/vtctld/tablet_data_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ limitations under the License.
package vtctld

import (
"context"
"io"
"sync"
"testing"
"time"

"context"

"google.golang.org/protobuf/proto"

"vitess.io/vitess/go/vt/logutil"
Expand Down Expand Up @@ -93,7 +92,7 @@ func (s *streamHealthTabletServer) streamHealthUnregister(id int) error {
// BroadcastHealth will broadcast the current health to all listeners
func (s *streamHealthTabletServer) BroadcastHealth() {
shr := &querypb.StreamHealthResponse{
TabletExternallyReparentedTimestamp: 42,
PrimaryTermStartTimestamp: 42,
RealtimeStats: &querypb.RealtimeStats{
HealthError: "testHealthError",
ReplicationLagSeconds: 72,
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletconntest/fakequeryservice.go
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ var TestStreamHealthStreamHealthResponse = &querypb.StreamHealthResponse{
},
Serving: true,

TabletExternallyReparentedTimestamp: 1234589,
PrimaryTermStartTimestamp: 1234589,

RealtimeStats: &querypb.RealtimeStats{
CpuUsage: 1.0,
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletmanager/tm_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ func (ts *tmState) updateLocked(ctx context.Context) error {
return nil
}

terTime := logutil.ProtoToTime(ts.tablet.PrimaryTermStartTime)
ptsTime := logutil.ProtoToTime(ts.tablet.PrimaryTermStartTime)

// Disable TabletServer first so the nonserving state gets advertised
// before other services are shutdown.
Expand All @@ -277,7 +277,7 @@ func (ts *tmState) updateLocked(ctx context.Context) error {
// always return error from 'SetServingType' and 'applyDenyList' to our client. It is up to them to handle it accordingly.
// UpdateLock is called from 'ChangeTabletType', 'Open' and 'RefreshFromTopoInfo'. For 'Open' and 'RefreshFromTopoInfo' we don't need
// to propagate error to client hence no changes there but we will propagate error from 'ChangeTabletType' to client.
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, terTime, false, reason); err != nil {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, ptsTime, false, reason); err != nil {
errStr := fmt.Sprintf("SetServingType(serving=false) failed: %v", err)
log.Errorf(errStr)
// No need to short circuit. Apply all steps and return error in the end.
Expand Down Expand Up @@ -326,7 +326,7 @@ func (ts *tmState) updateLocked(ctx context.Context) error {

// Open TabletServer last so that it advertises serving after all other services are up.
if reason == "" {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, terTime, true, ""); err != nil {
if err := ts.tm.QueryServiceControl.SetServingType(ts.tablet.Type, ptsTime, true, ""); err != nil {
errStr := fmt.Sprintf("Cannot start query service: %v", err)
log.Errorf(errStr)
returnErr = vterrors.Wrapf(err, errStr)
Expand Down
5 changes: 2 additions & 3 deletions go/vt/vttablet/tabletserver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package tabletserver

import (
"context"
"time"

"vitess.io/vitess/go/vt/dbconfigs"
"vitess.io/vitess/go/vt/mysqlctl"
Expand All @@ -27,8 +28,6 @@ import (
"vitess.io/vitess/go/vt/vttablet/tabletserver/schema"
"vitess.io/vitess/go/vt/vttablet/tabletserver/tabletenv"

"time"

querypb "vitess.io/vitess/go/vt/proto/query"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)
Expand All @@ -52,7 +51,7 @@ type Controller interface {

// SetServingType transitions the query service to the required serving type.
// Returns true if the state of QueryService or the tablet type changed.
SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, serving bool, reason string) error
SetServingType(tabletType topodatapb.TabletType, ptsTimestamp time.Time, serving bool, reason string) error

// EnterLameduck causes tabletserver to enter the lameduck state.
EnterLameduck()
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletserver/health_streamer.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,15 +208,15 @@ func (hs *healthStreamer) unregister(ch chan *querypb.StreamHealthResponse) {
delete(hs.clients, ch)
}

func (hs *healthStreamer) ChangeState(tabletType topodatapb.TabletType, terTimestamp time.Time, lag time.Duration, err error, serving bool) {
func (hs *healthStreamer) ChangeState(tabletType topodatapb.TabletType, ptsTimestamp time.Time, lag time.Duration, err error, serving bool) {
hs.mu.Lock()
defer hs.mu.Unlock()

hs.state.Target.TabletType = tabletType
if tabletType == topodatapb.TabletType_PRIMARY {
hs.state.TabletExternallyReparentedTimestamp = terTimestamp.Unix()
hs.state.PrimaryTermStartTimestamp = ptsTimestamp.Unix()
} else {
hs.state.TabletExternallyReparentedTimestamp = 0
hs.state.PrimaryTermStartTimestamp = 0
}
if err != nil {
hs.state.RealtimeStats.HealthError = err.Error()
Expand Down
6 changes: 3 additions & 3 deletions go/vt/vttablet/tabletserver/health_streamer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,9 @@ func TestHealthStreamerBroadcast(t *testing.T) {
Target: &querypb.Target{
TabletType: topodatapb.TabletType_PRIMARY,
},
TabletAlias: alias,
Serving: true,
TabletExternallyReparentedTimestamp: now.Unix(),
TabletAlias: alias,
Serving: true,
PrimaryTermStartTimestamp: now.Unix(),
RealtimeStats: &querypb.RealtimeStats{
FilteredReplicationLagSeconds: 1,
BinlogPlayersCount: 2,
Expand Down
16 changes: 8 additions & 8 deletions go/vt/vttablet/tabletserver/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ type stateManager struct {
wantTabletType topodatapb.TabletType
state servingState
target *querypb.Target
terTimestamp time.Time
ptsTimestamp time.Time
retrying bool
replHealthy bool
lameduck bool
Expand Down Expand Up @@ -209,7 +209,7 @@ func (sm *stateManager) Init(env tabletenv.Env, target *querypb.Target) {
// be honored.
// If sm is already in the requested state, it returns stateChanged as
// false.
func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, state servingState, reason string) error {
func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, ptsTimestamp time.Time, state servingState, reason string) error {
defer sm.ExitLameduck()

sm.hs.Open()
Expand All @@ -219,8 +219,8 @@ func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTime
state = StateNotConnected
}

log.Infof("Starting transition to %v %v, timestamp: %v", tabletType, state, terTimestamp)
if sm.mustTransition(tabletType, terTimestamp, state, reason) {
log.Infof("Starting transition to %v %v, primary term start timestamp: %v", tabletType, state, ptsTimestamp)
if sm.mustTransition(tabletType, ptsTimestamp, state, reason) {
return sm.execTransition(tabletType, state)
}
return nil
Expand All @@ -230,7 +230,7 @@ func (sm *stateManager) SetServingType(tabletType topodatapb.TabletType, terTime
// state. If so, it acquires the semaphore and returns true. If a transition is
// already in progress, it waits. If the desired state is already reached, it
// returns false without acquiring the semaphore.
func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, terTimestamp time.Time, state servingState, reason string) bool {
func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, ptsTimestamp time.Time, state servingState, reason string) bool {
if sm.transitioning.Acquire(context.Background(), 1) != nil {
return false
}
Expand All @@ -239,7 +239,7 @@ func (sm *stateManager) mustTransition(tabletType topodatapb.TabletType, terTime

sm.wantTabletType = tabletType
sm.wantState = state
sm.terTimestamp = terTimestamp
sm.ptsTimestamp = ptsTimestamp
sm.reason = reason
if sm.target.TabletType == tabletType && sm.state == state {
sm.transitioning.Release(1)
Expand Down Expand Up @@ -639,7 +639,7 @@ func (sm *stateManager) stateStringLocked(tabletType topodatapb.TabletType, stat
if tabletType != topodatapb.TabletType_PRIMARY {
return fmt.Sprintf("%v: %v", tabletType, state)
}
return fmt.Sprintf("%v: %v, %v", tabletType, state, sm.terTimestamp.Local().Format("Jan 2, 2006 at 15:04:05 (MST)"))
return fmt.Sprintf("%v: %v, %v", tabletType, state, sm.ptsTimestamp.Local().Format("Jan 2, 2006 at 15:04:05 (MST)"))
}

func (sm *stateManager) handleGracePeriod(tabletType topodatapb.TabletType) {
Expand Down Expand Up @@ -674,7 +674,7 @@ func (sm *stateManager) Broadcast() {
defer sm.mu.Unlock()

lag, err := sm.refreshReplHealthLocked()
sm.hs.ChangeState(sm.target.TabletType, sm.terTimestamp, lag, err, sm.isServingLocked())
sm.hs.ChangeState(sm.target.TabletType, sm.ptsTimestamp, lag, err, sm.isServingLocked())
}

func (sm *stateManager) refreshReplHealthLocked() (time.Duration, error) {
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletserver/state_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func TestStateManagerServePrimary(t *testing.T) {
require.NoError(t, err)

assert.Equal(t, false, sm.lameduck)
assert.Equal(t, testNow, sm.terTimestamp)
assert.Equal(t, testNow, sm.ptsTimestamp)

verifySubcomponent(t, 1, sm.watcher, testStateClosed)

Expand Down
4 changes: 2 additions & 2 deletions go/vt/vttablet/tabletserver/tabletserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,12 +371,12 @@ func (tsv *TabletServer) InitACL(tableACLConfigFile string, enforceTableACLConfi
// SetServingType changes the serving type of the tabletserver. It starts or
// stops internal services as deemed necessary.
// Returns true if the state of QueryService or the tablet type changed.
func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, terTimestamp time.Time, serving bool, reason string) error {
func (tsv *TabletServer) SetServingType(tabletType topodatapb.TabletType, ptsTimestamp time.Time, serving bool, reason string) error {
state := StateNotServing
if serving {
state = StateServing
}
return tsv.sm.SetServingType(tabletType, terTimestamp, state, reason)
return tsv.sm.SetServingType(tabletType, ptsTimestamp, state, reason)
}

// StartService is a convenience function for InitDBConfig->SetServingType
Expand Down
2 changes: 1 addition & 1 deletion go/vt/vttablet/tabletservermock/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ func (tqsc *Controller) InitDBConfig(target *querypb.Target, dbcfgs *dbconfigs.D
}

// SetServingType is part of the tabletserver.Controller interface
func (tqsc *Controller) SetServingType(tabletType topodatapb.TabletType, terTime time.Time, serving bool, reason string) error {
func (tqsc *Controller) SetServingType(tabletType topodatapb.TabletType, ptsTime time.Time, serving bool, reason string) error {
tqsc.mu.Lock()
defer tqsc.mu.Unlock()

Expand Down
8 changes: 4 additions & 4 deletions proto/query.proto
Original file line number Diff line number Diff line change
Expand Up @@ -922,8 +922,8 @@ message StreamHealthResponse {
// or if a replica should not be used because the keyspace is being resharded.
bool serving = 2;

// tablet_externally_reparented_timestamp can be interpreted as the
// last time we knew that this tablet was the PRIMARY of this shard
// primary_term_start_timestamp can be interpreted as the
// last time we knew that this tablet was promoted to a PRIMARY of this shard
// (if StreamHealthResponse describes a group of tablets, between
// two vtgates, only one primary will be present in the group, and
// this is this primary's value).
Expand All @@ -947,8 +947,8 @@ message StreamHealthResponse {
// as PRIMARY because it was recorded as the shard's current primary in the
// topology (see go/vt/vttablet/tabletmanager/init_tablet.go)
// OR
// d) 0 if the vttablet was never a PRIMARY.
int64 tablet_externally_reparented_timestamp = 3;
// d) 0 if the vttablet is not a PRIMARY.
int64 primary_term_start_timestamp = 3;

// realtime_stats contains information about the tablet status.
// It is only filled in if the information is about a tablet.
Expand Down
8 changes: 4 additions & 4 deletions web/vtadmin/src/proto/vtadmin.d.ts

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading