Skip to content

Commit

Permalink
Automatically retry the build when a remote cache eviction error is encountered
Browse files Browse the repository at this point in the history
With bazelbuild#17358, Bazel exits with code 39 if the remote cache evicts blobs during the build. With bazelbuild#17462 and bazelbuild#17747, Bazel is able to continue the build without `bazel clean` or `bazel shutdown`.

However, even with bazelbuild#17639 and the follow-up changes that extend the lease, the remote cache can still evict blobs in some rare cases.

Building on the above changes, this PR makes Bazel retry the invocation when the previous attempt encountered a remote cache eviction error and `--experimental_remote_cache_eviction_retries` is set to a non-zero value — in other words, **build rewinding**.

```
$ bazel build --experimental_remote_cache_eviction_retries=5 ...
INFO: Invocation ID: b7348bfa-9446-4c72-a888-0a0ad012f225
Loading:
Loading:
Loading: 0 packages loaded
Analyzing: target //a:bar (0 packages loaded, 0 targets configured)
INFO: Analyzed target //a:bar (0 packages loaded, 0 targets configured).
INFO: Found 1 target...
[0 / 2] [Prepa] BazelWorkspaceStatusAction stable-status.txt
ERROR: .../workspace/a/BUILD:8:8: Executing genrule //a:bar failed: Failed to fetch blobs because they do not exist remotely: Missing digest: b5bb9d8014a0f9b1d61e21e796d78dccdf1352f23cd32812f4850b878ae4944c/4
Target //a:bar failed to build
Use --verbose_failures to see the command lines of failed build steps.
INFO: Elapsed time: 0.447s, Critical Path: 0.05s
INFO: 2 processes: 2 internal.
ERROR: Build did NOT complete successfully
Found remote cache eviction error, retrying the build...
INFO: Invocation ID: 983f60dc-8bb9-4b82-aa33-a378469ce140
Loading:
Loading:
Loading: 0 packages loaded
Analyzing: target //a:bar (0 packages loaded, 0 targets configured)
INFO: Analyzed target //a:bar (0 packages loaded, 0 targets configured).
INFO: Found 1 target...
[0 / 2] [Prepa] BazelWorkspaceStatusAction stable-status.txt
Target //a:bar up-to-date:
  bazel-bin/a/bar.out
INFO: Elapsed time: 0.866s, Critical Path: 0.35s
INFO: 3 processes: 1 internal, 1 processwrapper-sandbox, 1 remote.
INFO: Build completed successfully, 3 total actions
$
```

Part of bazelbuild#16660.

Closes bazelbuild#17711.

PiperOrigin-RevId: 520610524
Change-Id: I20d43d1968767a03250b9c8f8a6dda4e056d4f52
  • Loading branch information
coeuvre authored and ShreeM01 committed Mar 31, 2023
1 parent 3ea18cc commit 7b455eb
Show file tree
Hide file tree
Showing 6 changed files with 153 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -243,12 +243,33 @@ public int getId() {
public ListenableFuture<Void> prefetchInputs()
throws IOException, ForbiddenActionInputException {
if (Spawns.shouldPrefetchInputsForLocalExecution(spawn)) {
return actionExecutionContext
.getActionInputPrefetcher()
.prefetchFiles(
getInputMapping(PathFragment.EMPTY_FRAGMENT, /* willAccessRepeatedly= */ true)
.values(),
getMetadataProvider());
return Futures.catchingAsync(
actionExecutionContext
.getActionInputPrefetcher()
.prefetchFiles(
getInputMapping(PathFragment.EMPTY_FRAGMENT, /* willAccessRepeatedly= */ true)
.values(),
getMetadataProvider(),
Priority.MEDIUM),
BulkTransferException.class,
(BulkTransferException e) -> {
if (BulkTransferException.allCausedByCacheNotFoundException(e)) {
var code =
(executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0)
? Code.REMOTE_CACHE_EVICTED
: Code.REMOTE_CACHE_FAILED;
throw new EnvironmentalExecException(
e,
FailureDetail.newBuilder()
.setMessage("Failed to fetch blobs because they do not exist remotely.")
.setSpawn(FailureDetails.Spawn.newBuilder().setCode(code))
.build());
} else {
throw e;
}
},
directExecutor());
}

return immediateVoidFuture();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,28 @@ public boolean usingLocalTestJobs() {
+ "test log. Otherwise, Bazel generates a test.xml as part of the test action.")
public boolean splitXmlGeneration;

/**
 * Selects the failure code reported when a spawn fails because blobs are missing from the remote
 * cache: when {@code true} (the default) the failure is reported as {@code REMOTE_CACHE_EVICTED},
 * otherwise as {@code REMOTE_CACHE_FAILED}. Per the help text these correspond to exit codes 39
 * and 34 respectively.
 */
@Option(
name = "incompatible_remote_use_new_exit_code_for_lost_inputs",
defaultValue = "true",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.UNKNOWN},
metadataTags = {OptionMetadataTag.INCOMPATIBLE_CHANGE},
help =
"If set to true, Bazel will use new exit code 39 instead of 34 if remote cache evicts"
+ " blobs during the build.")
public boolean useNewExitCodeForLostInputs;

/**
 * Upper bound on how many times a command is re-run after it finishes with the
 * {@code REMOTE_CACHE_EVICTED} exit code. The default of {@code 0} disables retrying. Any value
 * greater than zero also causes lost-input failures to be reported as
 * {@code REMOTE_CACHE_EVICTED}, regardless of {@link #useNewExitCodeForLostInputs}.
 */
@Option(
name = "experimental_remote_cache_eviction_retries",
defaultValue = "0",
documentationCategory = OptionDocumentationCategory.REMOTE,
effectTags = {OptionEffectTag.EXECUTION},
help =
"The maximum number of attempts to retry if the build encountered remote cache eviction"
+ " error. A non-zero value will implicitly set"
+ " --incompatible_remote_use_new_exit_code_for_lost_inputs to true.")
public int remoteRetryOnCacheEviction;

/** An enum for specifying different formats of test output. */
public enum TestOutputFormat {
SUMMARY, // Provide summary output only.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,8 @@ private SpawnResult handleError(
catastrophe = true;
} else if (remoteCacheFailed) {
status = Status.REMOTE_CACHE_FAILED;
if (remoteOptions.useNewExitCodeForLostInputs) {
if (executionOptions.useNewExitCodeForLostInputs
|| executionOptions.remoteRetryOnCacheEviction > 0) {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_EVICTED;
} else {
detailedCode = FailureDetails.Spawn.Code.REMOTE_CACHE_FAILED;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import com.google.devtools.build.lib.events.PrintingEventHandler;
import com.google.devtools.build.lib.events.Reporter;
import com.google.devtools.build.lib.events.StoredEventHandler;
import com.google.devtools.build.lib.exec.ExecutionOptions;
import com.google.devtools.build.lib.profiler.MemoryProfiler;
import com.google.devtools.build.lib.profiler.Profiler;
import com.google.devtools.build.lib.profiler.SilentCloseable;
Expand All @@ -54,6 +55,7 @@
import com.google.devtools.build.lib.util.AnsiStrippingOutputStream;
import com.google.devtools.build.lib.util.DebugLoggerConfigurator;
import com.google.devtools.build.lib.util.DetailedExitCode;
import com.google.devtools.build.lib.util.ExitCode;
import com.google.devtools.build.lib.util.InterruptedFailureDetails;
import com.google.devtools.build.lib.util.LoggingUtil;
import com.google.devtools.build.lib.util.Pair;
Expand Down Expand Up @@ -230,18 +232,29 @@ public BlazeCommandResult exec(
return createDetailedCommandResult(
retrievedShutdownReason, FailureDetails.Command.Code.PREVIOUSLY_SHUTDOWN);
}
BlazeCommandResult result =
execExclusively(
originalCommandLine,
invocationPolicy,
args,
outErr,
firstContactTimeMillis,
commandName,
command,
waitTimeInMs,
startupOptionsTaggedWithBazelRc,
commandExtensions);
BlazeCommandResult result;
int attempt = 0;
while (true) {
try {
result =
execExclusively(
originalCommandLine,
invocationPolicy,
args,
outErr,
firstContactTimeMillis,
commandName,
command,
waitTimeInMs,
startupOptionsTaggedWithBazelRc,
commandExtensions,
attempt);
break;
} catch (RemoteCacheEvictedException e) {
outErr.printErrLn("Found remote cache eviction error, retrying the build...");
attempt += 1;
}
}
if (result.shutdown()) {
setShutdownReason(
"Server shut down "
Expand Down Expand Up @@ -289,7 +302,9 @@ private BlazeCommandResult execExclusively(
BlazeCommand command,
long waitTimeInMs,
Optional<List<Pair<String, String>>> startupOptionsTaggedWithBazelRc,
List<Any> commandExtensions) {
List<Any> commandExtensions,
int attempt)
throws RemoteCacheEvictedException {
// Record the start time for the profiler. Do not put anything before this!
long execStartTimeNanos = runtime.getClock().nanoTime();

Expand Down Expand Up @@ -631,7 +646,18 @@ private BlazeCommandResult execExclusively(
}

needToCallAfterCommand = false;
return runtime.afterCommand(env, result);
var newResult = runtime.afterCommand(env, result);
if (newResult.getExitCode().equals(ExitCode.REMOTE_CACHE_EVICTED)) {
var executionOptions =
Preconditions.checkNotNull(options.getOptions(ExecutionOptions.class));
if (attempt < executionOptions.remoteRetryOnCacheEviction) {
throw new RemoteCacheEvictedException();
}
}

return newResult;
} catch (RemoteCacheEvictedException e) {
throw e;
} catch (Throwable e) {
logger.atSevere().withCause(e).log("Shutting down due to exception");
Crash crash = Crash.from(e);
Expand Down Expand Up @@ -665,6 +691,8 @@ private BlazeCommandResult execExclusively(
}
}

/**
 * Internal signal that a command ended with {@code ExitCode.REMOTE_CACHE_EVICTED} while retry
 * attempts remain. It is thrown out of {@code execExclusively} and caught by the retry loop in
 * {@code exec}, which prints a notice and re-runs the command with an incremented attempt count.
 * It carries no message or cause.
 */
private static class RemoteCacheEvictedException extends IOException {}

private static void replayEarlyExitEvents(
OutErr outErr,
BlazeOptionHandler optionHandler,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,7 @@ public void remoteCacheEvictBlobs_whenPrefetchingInput_exitWithCode39() throws E
// Assert: Exit code is 39
assertThat(error)
.hasMessageThat()
.contains(
"Build without the Bytes does not work if your remote cache evicts blobs"
+ " during builds");
.contains("Failed to fetch blobs because they do not exist remotely");
assertThat(error).hasMessageThat().contains(String.format("%s/%s", hashCode, bytes.length));
assertThat(error.getDetailedExitCode().getExitCode().getNumericExitCode()).isEqualTo(39);
}
Expand Down
59 changes: 59 additions & 0 deletions src/test/shell/bazel/remote/build_without_the_bytes_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1627,4 +1627,63 @@ end_of_record"
expect_log "$expected_result"
}

# Verifies that, with --experimental_remote_cache_eviction_retries set, Bazel
# automatically retries an invocation that failed because the remote cache
# evicted blobs, and that the retried invocation succeeds.
#
# Scenario:
#   1. Build //a:bar remotely to populate the remote cache.
#   2. Clean and rebuild with --remote_download_minimal so that the
#      intermediate output foo.out is known to Bazel but not present locally.
#   3. Restart the worker to drop every blob from the remote cache.
#   4. Change bar.in so that //a:bar (tagged no-remote-exec) must re-run and
#      needs foo.out, which can no longer be fetched.
#   5. Expect the build to succeed and the retry notice to be logged.
function test_remote_cache_eviction_retries() {
  mkdir -p a

  cat > a/BUILD <<'EOF'
genrule(
name = 'foo',
srcs = ['foo.in'],
outs = ['foo.out'],
cmd = 'cat $(SRCS) > $@',
)
genrule(
name = 'bar',
srcs = ['foo.out', 'bar.in'],
outs = ['bar.out'],
cmd = 'cat $(SRCS) > $@',
tags = ['no-remote-exec'],
)
EOF

  echo foo > a/foo.in
  echo bar > a/bar.in

  # Populate remote cache
  bazel build \
    --remote_executor="grpc://localhost:${worker_port}" \
    --remote_download_minimal \
    //a:bar >& "$TEST_log" || fail "Failed to build"

  # A failed clean would leave local outputs in place and undermine the
  # setup below, so surface it immediately.
  bazel clean || fail "Failed to clean"

  # Clean build, foo.out isn't downloaded
  bazel build \
    --remote_executor="grpc://localhost:${worker_port}" \
    --remote_download_minimal \
    //a:bar >& "$TEST_log" || fail "Failed to build"

  if [[ -f bazel-bin/a/foo.out ]]; then
    fail "Expected intermediate output bazel-bin/a/foo.out to not be downloaded"
  fi

  # Evict blobs from remote cache
  stop_worker
  start_worker

  echo "updated bar" > a/bar.in

  # Incremental build triggers remote cache eviction error but Bazel
  # automatically retries the build and reruns the generating actions for
  # missing blobs
  bazel build \
    --remote_executor="grpc://localhost:${worker_port}" \
    --remote_download_minimal \
    --experimental_remote_cache_eviction_retries=5 \
    //a:bar >& "$TEST_log" || fail "Failed to build"

  expect_log "Found remote cache eviction error, retrying the build..."
}

run_suite "Build without the Bytes tests"

0 comments on commit 7b455eb

Please sign in to comment.