Skip to content

Commit

Permalink
[raft] fix removeDataReq in removeZombie and also add some logging (#8208)
Browse files Browse the repository at this point in the history

Added logging to track how the range descriptor changes in the driver.
  • Loading branch information
luluz66 authored Jan 18, 2025
1 parent e0318af commit 26a4fc9
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
10 changes: 6 additions & 4 deletions enterprise/server/raft/driver/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ func (rq *Queue) computeAction(rd *rfpb.RangeDescriptor, usage *rfpb.ReplicaUsag
return action, adjustedPriority
}
}
} else {
rq.log.Debugf("cannot split range %d: num of suspect replicas: %d, num deadReplicas: %d, num replicas marked for removal: %d", rd.GetRangeId(), len(replicasByStatus.SuspectReplicas), numDeadReplicas, len(rd.GetRemoved()))
}

if rd.GetRangeId() == constants.MetaRangeID {
Expand Down Expand Up @@ -1041,8 +1043,8 @@ func (rq *Queue) applyChange(ctx context.Context, change *change) error {
rq.log.Errorf("AddReplica %+v err: %s", change.addOp, err)
return err
}
rq.log.Infof("AddReplicaRequest finished: %+v", change.addOp)
rd = rsp.GetRange()
rq.log.Infof("AddReplicaRequest finished: op: %+v, rd: %+v", change.addOp, rd)
}
if change.removeOp != nil {
if rd != nil {
Expand All @@ -1059,7 +1061,7 @@ func (rq *Queue) applyChange(ctx context.Context, change *change) error {
rq.log.Errorf("RemoveReplica %+v err: %s", change.removeOp, err)
return err
}
rq.log.Infof("RemoveReplicaRequest finished: %+v", change.removeOp)
rq.log.Infof("RemoveReplicaRequest finished: op: %+v, rd: %+v", change.removeOp, rsp.GetRange())

replicaDesc := &rfpb.ReplicaDescriptor{RangeId: change.removeOp.GetRange().GetRangeId(), ReplicaId: change.removeOp.GetReplicaId()}
// Remove the data from the now stopped node. This is best-effort only,
Expand All @@ -1070,7 +1072,7 @@ func (rq *Queue) applyChange(ctx context.Context, change *change) error {
rq.log.Warningf("RemoveReplica unable to remove data on c%dn%d, err getting api client: %s", replicaDesc.GetRangeId(), replicaDesc.GetReplicaId(), err)
return nil
}
_, err = c.RemoveData(ctx, &rfpb.RemoveDataRequest{
removeDataRsp, err := c.RemoveData(ctx, &rfpb.RemoveDataRequest{
ReplicaId: replicaDesc.GetReplicaId(),
Range: rsp.GetRange(),
})
Expand All @@ -1079,7 +1081,7 @@ func (rq *Queue) applyChange(ctx context.Context, change *change) error {
return nil
}

rq.log.Infof("Removed shard: c%dn%d", replicaDesc.GetRangeId(), replicaDesc.GetReplicaId())
rq.log.Infof("Removed shard: c%dn%d, rd: %+v", replicaDesc.GetRangeId(), replicaDesc.GetReplicaId(), removeDataRsp.GetRange())
}
if change.transferLeadershipOp != nil {
_, err := rq.store.TransferLeadership(ctx, change.transferLeadershipOp)
Expand Down
8 changes: 7 additions & 1 deletion enterprise/server/raft/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -1638,13 +1638,18 @@ func (j *replicaJanitor) scan(ctx context.Context) {
}

func (j *replicaJanitor) removeZombie(ctx context.Context, task zombieCleanupTask) (zombieCleanupAction, error) {
log.Debugf("removing zombie c%dn%d", task.rangeID, task.replicaID)
removeDataReq := &rfpb.RemoveDataRequest{
ReplicaId: task.shardInfo.ReplicaID,
}
if task.action == zombieCleanupNoAction {
return zombieCleanupNoAction, nil
} else if task.action == zombieCleanupRemoveData {
removeDataReq.RangeId = task.rangeID
if task.rd == nil {
removeDataReq.RangeId = task.rangeID
} else {
removeDataReq.Range = task.rd
}
} else if task.action == zombieCleanupRemoveReplica {
// In the rare case where the zombie holds the leader, we try to transfer the leader away first.
if j.store.isLeader(task.shardInfo.ShardID, task.shardInfo.ReplicaID) {
Expand Down Expand Up @@ -2827,6 +2832,7 @@ func (s *Store) markReplicaForRemovalFromRangeDescriptor(ctx context.Context, ra
}

func (s *Store) removeReplicaFromRangeDescriptor(ctx context.Context, rangeID, replicaID uint64, oldDescriptor *rfpb.RangeDescriptor) (*rfpb.RangeDescriptor, error) {
s.log.Infof("removing c%dn%d from range descriptor", rangeID, replicaID)
newDescriptor := proto.Clone(oldDescriptor).(*rfpb.RangeDescriptor)
for i, replica := range newDescriptor.Removed {
if replica.GetReplicaId() == replicaID {
Expand Down

0 comments on commit 26a4fc9

Please sign in to comment.