Skip to content

Commit

Permalink
Fix race condition in SnapshotBasedIndexRecoveryIT (#79404)
Browse files Browse the repository at this point in the history
If we don't cancel the re-location of the index to the same target
node, it is possible that the recovery is retried, meaning that it's
possible that the available permit is granted to indexRecoveredFromSnapshot1
instead of to indexRecoveredFromSnapshot2.

Relates #79316
Closes #79420
  • Loading branch information
fcofdez authored Nov 29, 2021
1 parent f3b5299 commit 5cb5d92
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexSh
}

public void startRecovery(final IndexShard indexShard, final DiscoveryNode sourceNode, final RecoveryListener listener) {
final Releasable snapshotFileDownloadsPermit = recoverySettings.tryAcquireSnapshotDownloadPermits();
final Releasable snapshotFileDownloadsPermit = tryAcquireSnapshotDownloadPermits();
// create a new recovery status, and process...
final long recoveryId = onGoingRecoveries.startRecovery(
indexShard,
Expand Down Expand Up @@ -258,6 +258,11 @@ private void doRecovery(final long recoveryId, final StartRecoveryRequest preExi
);
}

// Visible for testing
public Releasable tryAcquireSnapshotDownloadPermits() {
return recoverySettings.tryAcquireSnapshotDownloadPermits();
}

/**
* Prepare the start recovery request.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.core.CheckedRunnable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.MergePolicyConfig;
Expand Down Expand Up @@ -85,6 +86,7 @@
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS;
import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_SNAPSHOT_FILE_DOWNLOADS_PER_NODE;
Expand Down Expand Up @@ -914,7 +916,6 @@ public void testRecoveryUsingSnapshotsIsThrottledPerNode() throws Exception {
);
}

@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/79420")
public void testRecoveryUsingSnapshotsPermitIsReturnedAfterFailureOrCancellation() throws Exception {
executeRecoveryWithSnapshotFileDownloadThrottled(
(
Expand All @@ -930,7 +931,12 @@ public void testRecoveryUsingSnapshotsPermitIsReturnedAfterFailureOrCancellation
client().admin()
.indices()
.prepareUpdateSettings(indexRecoveredFromSnapshot1)
.setSettings(Settings.builder().put("index.routing.allocation.require._name", targetNode))
.setSettings(
Settings.builder()
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
.put("index.routing.allocation.require._name", (String) null)
.put("index.routing.allocation.include._name", sourceNode + "," + targetNode)
)
.get()
);

Expand Down Expand Up @@ -963,6 +969,16 @@ public void testRecoveryUsingSnapshotsPermitIsReturnedAfterFailureOrCancellation

targetMockTransportService.clearAllRules();
channelRef.get().sendResponse(new IOException("unable to clean files"));
PeerRecoveryTargetService peerRecoveryTargetService = internalCluster().getInstance(
PeerRecoveryTargetService.class,
targetNode
);
assertBusy(() -> {
// Wait until the current RecoveryTarget releases the snapshot download permit
try (Releasable snapshotDownloadPermit = peerRecoveryTargetService.tryAcquireSnapshotDownloadPermits()) {
assertThat(snapshotDownloadPermit, is(notNullValue()));
}
});
}

String indexRecoveredFromSnapshot2 = indices.get(1);
Expand Down Expand Up @@ -1140,10 +1156,11 @@ private void executeRecoveryWithSnapshotFileDownloadThrottled(SnapshotBasedRecov
indexName,
Settings.builder()
.put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
.put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
.put(IndexService.GLOBAL_CHECKPOINT_SYNC_INTERVAL_SETTING.getKey(), "1s")
.put("index.routing.allocation.require._name", dataNodes.get(0))
.put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
.put(SETTING_ALLOCATION_MAX_RETRY.getKey(), 0)
.build()
);
indices.add(indexName);
Expand Down

0 comments on commit 5cb5d92

Please sign in to comment.