From 471c1342b102c8d2e5b718986fa25e18dd310494 Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Wed, 22 Jun 2022 11:18:10 +0530 Subject: [PATCH] rbd: issue resync only if the force flag is set During failover we do demote the volume on the primary as the image is still not promoted yet on the remote cluster, there are spurious split-brain errors reported by RBD, the Cephcsi resync will attempt to resync from the "known" secondary and that will cause data loss Signed-off-by: Madhu Rajanna (cherry picked from commit 3acaa018dbb971df40b29a848eba1ca0c0420299) --- internal/rbd/replicationcontrollerserver.go | 38 +++++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/internal/rbd/replicationcontrollerserver.go b/internal/rbd/replicationcontrollerserver.go index edeb0c0d3f2..90c885632bc 100644 --- a/internal/rbd/replicationcontrollerserver.go +++ b/internal/rbd/replicationcontrollerserver.go @@ -855,18 +855,11 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context, ready = checkRemoteSiteStatus(ctx, mirrorStatus) } - if resyncRequired(localStatus) { - err = rbdVol.resyncImage() - if err != nil { - log.ErrorLog(ctx, err.Error()) - - return nil, status.Error(codes.Internal, err.Error()) - } + err = resyncVolume(localStatus, rbdVol, req.Force) + if err != nil { + log.ErrorLog(ctx, err.Error()) - // If we issued a resync, return a non-final error as image needs to be recreated - // locally. Caller retries till RBD syncs an initial version of the image to - // report its status in the resync request. - return nil, status.Error(codes.Unavailable, "awaiting initial resync due to split brain") + return nil, err } err = checkVolumeResyncStatus(localStatus) @@ -886,6 +879,29 @@ func (rs *ReplicationServer) ResyncVolume(ctx context.Context, return resp, nil } +func resyncVolume(localStatus librbd.SiteMirrorImageStatus, rbdVol *rbdVolume, force bool) error { + if resyncRequired(localStatus) { + // If the force option is not set return the error message to retry + // with Force option. + if !force { + return status.Errorf(codes.FailedPrecondition, + "image is in %q state, description (%s). Force resync to recover volume", + localStatus.State, localStatus.Description) + } + err := rbdVol.resyncImage() + if err != nil { + return status.Error(codes.Internal, err.Error()) + } + + // If we issued a resync, return a non-final error as image needs to be recreated + // locally. Caller retries till RBD syncs an initial version of the image to + // report its status in the resync request. + return status.Error(codes.Unavailable, "awaiting initial resync due to split brain") + } + + return nil +} + func checkVolumeResyncStatus(localStatus librbd.SiteMirrorImageStatus) error { // we are considering 2 states to check resync started and resync completed // as below. all other states will be considered as an error state so that