[backoff] fix tests to account for jitter (grpc#37629)
Fix some test flakiness caused by the jitter fix in the backoff library in grpc#37595.

In the xDS retry tests, the additional jitter allowed an extra attempt to sneak in before the call deadline, so I adjusted the knobs to ensure that exactly the expected number of attempts fits within each test's timeout.

In the PF test, I rewrote the test to use a connection injector, so that it can measure the time between connection attempts more accurately, without skew from the server startup time.
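For reference, the timing window involved follows directly from the backoff parameters. Here is a minimal standalone sketch of the arithmetic, not part of the commit; it assumes the 0.2 jitter from grpc#37595 and the 100 ms initial backoff that the PF test sets below:

#include <cstdio>

int main() {
  // The PF test sets GRPC_ARG_INITIAL_RECONNECT_BACKOFF_MS to 100 ms, and
  // the backoff library now applies +/-20% jitter, so the second connection
  // attempt starts 80-120 ms after the first.  The test asserts a slightly
  // wider window (0.7x-1.3x) to absorb thread hops and measurement skew.
  const double kInitialBackoffMs = 100.0;
  const double kJitter = 0.2;
  std::printf("second attempt starts in [%.0f ms, %.0f ms]\n",
              kInitialBackoffMs * (1 - kJitter),
              kInitialBackoffMs * (1 + kJitter));
  std::printf("test accepts [%.0f ms, %.0f ms]\n", kInitialBackoffMs * 0.7,
              kInitialBackoffMs * 1.3);
}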

Closes grpc#37629

COPYBARA_INTEGRATE_REVIEW=grpc#37629 from markdroth:xds_retry_backoff_flake_fix f35ac90
PiperOrigin-RevId: 671120341
markdroth authored and copybara-github committed Sep 4, 2024
1 parent 4f8087e commit c83f91e
Showing 2 changed files with 47 additions and 29 deletions.
48 changes: 29 additions & 19 deletions test/cpp/end2end/client_lb_end2end_test.cc
@@ -954,39 +954,49 @@ TEST_F(PickFirstTest, SelectsReadyAtStartup) {
 }
 
 TEST_F(PickFirstTest, BackOffInitialReconnect) {
+  StartServers(1);
   ChannelArguments args;
   constexpr int kInitialBackOffMs = 100;
   args.SetInt(GRPC_ARG_INITIAL_RECONNECT_BACKOFF_MS,
               kInitialBackOffMs * grpc_test_slowdown_factor());
-  const std::vector<int> ports = {grpc_pick_unused_port_or_die()};
   FakeResolverResponseGeneratorWrapper response_generator;
   auto channel = BuildChannel("pick_first", response_generator, args);
   auto stub = BuildStub(channel);
-  response_generator.SetNextResolution(ports);
-  // Start trying to connect.  The channel will report
-  // TRANSIENT_FAILURE, because the server is not reachable.
-  const grpc_core::Timestamp t0 = grpc_core::Timestamp::Now();
-  ASSERT_TRUE(WaitForChannelState(
-      channel.get(),
-      [&](grpc_connectivity_state state) {
+  response_generator.SetNextResolution({servers_[0]->port_});
+  // Intercept the first two connection attempts.
+  ConnectionAttemptInjector injector;
+  auto hold1 = injector.AddHold(servers_[0]->port_);
+  auto hold2 = injector.AddHold(servers_[0]->port_);
+  // Start trying to connect.
+  EXPECT_EQ(channel->GetState(/*try_to_connect=*/true), GRPC_CHANNEL_IDLE);
+  // When the first connection attempt starts, record the time, then fail the
+  // attempt.
+  hold1->Wait();
+  const grpc_core::Timestamp first_attempt_time = grpc_core::Timestamp::Now();
+  hold1->Fail(absl::UnavailableError("nope"));
+  // Wait for the second attempt and see how long it took.
+  hold2->Wait();
+  const grpc_core::Duration waited =
+      grpc_core::Timestamp::Now() - first_attempt_time;
+  // The channel will transition to TRANSIENT_FAILURE.
+  EXPECT_TRUE(
+      WaitForChannelState(channel.get(), [&](grpc_connectivity_state state) {
        if (state == GRPC_CHANNEL_TRANSIENT_FAILURE) return true;
        EXPECT_THAT(state, ::testing::AnyOf(GRPC_CHANNEL_IDLE,
                                            GRPC_CHANNEL_CONNECTING));
        return false;
-      },
-      /*try_to_connect=*/true));
-  // Bring up a server on the chosen port.
-  StartServers(1, ports);
-  // Now the channel will become connected.
-  ASSERT_TRUE(WaitForChannelReady(channel.get()));
+      }));
+  // Now let the second attempt complete.
+  hold2->Resume();
+  // The channel will transition to READY.
+  EXPECT_TRUE(WaitForChannelReady(channel.get()));
   // Check how long it took.
-  const grpc_core::Duration waited = grpc_core::Timestamp::Now() - t0;
   VLOG(2) << "Waited " << waited.millis() << " milliseconds";
   // We should have waited at least kInitialBackOffMs, plus or minus
-  // jitter.  We give a little more leeway on the high end to account
-  // for things taking a little longer than expected in other threads.
+  // jitter.  Jitter is 0.2, but we give extra leeway to account for
+  // measurement skew, thread hops, etc.
   EXPECT_GE(waited.millis(),
-            (kInitialBackOffMs * grpc_test_slowdown_factor()) * 0.8);
+            (kInitialBackOffMs * grpc_test_slowdown_factor()) * 0.7);
   EXPECT_LE(waited.millis(),
             (kInitialBackOffMs * grpc_test_slowdown_factor()) * 1.3);
 }
@@ -2886,7 +2896,7 @@ class ClientLbAddressTest : public ClientLbEnd2endTest {
     grpc_core::CoreConfiguration::Reset();
   }
 
-  const std::vector<std::string>& addresses_seen() {
+  std::vector<std::string> addresses_seen() {
     grpc_core::MutexLock lock(&mu_);
     return addresses_seen_;
  }
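The second hunk in this file is a smaller hardening: returning a const reference to a mutex-guarded member lets the caller keep reading it after the lock is released, whereas returning by value copies while the lock is held. A minimal illustration of the distinction, which is my reading of the change rather than anything stated in the commit message, using std::mutex and a hypothetical AddressLog class instead of the test's grpc_core::Mutex:

#include <mutex>
#include <string>
#include <vector>

class AddressLog {
 public:
  // Returning by value: the copy is made while mu_ is held, so the caller
  // never touches addresses_seen_ outside the critical section.  A
  // `const std::vector<std::string>&` return would hand out a reference
  // that outlives the lock and can race with writers.
  std::vector<std::string> addresses_seen() {
    std::lock_guard<std::mutex> lock(mu_);
    return addresses_seen_;
  }

 private:
  std::mutex mu_;
  std::vector<std::string> addresses_seen_;  // guarded by mu_
};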
28 changes: 18 additions & 10 deletions test/cpp/end2end/xds/xds_routing_end2end_test.cc
@@ -1889,12 +1889,19 @@ TEST_P(LdsRdsTest, XdsRetryPolicyLongBackOff) {
       "5xx,cancelled,deadline-exceeded,internal,resource-exhausted,"
       "unavailable");
   retry_policy->mutable_num_retries()->set_value(kNumRetries);
-  // Set backoff to 1 second, 1/2 of rpc timeout of 2 second.
+  // Set base interval to 1.5 seconds.  Multiplier is 2, so the delay after
+  // the second attempt will be ~3 seconds.  However, there is a jitter of
+  // 0.2, so the delay after the first attempt will be in the range
+  // [1.2, 1.8], and the delay after the second attempt will be in the range
+  // [2.4, 3.6].  So the first retry will always start before the 2.5 second
+  // timeout, and the second retry cannot start earlier than
+  // 1.2 + 2.4 = 3.6 seconds, which is after the 2.5 second timeout.  This
+  // ensures that there will be exactly one retry.
+  // No need to set max interval; it defaults to 10x base interval.
   SetProtoDuration(
-      grpc_core::Duration::Seconds(1),
+      grpc_core::Duration::Milliseconds(1500),
       retry_policy->mutable_retry_back_off()->mutable_base_interval());
   SetRouteConfiguration(balancer_.get(), new_route_config);
-  // No need to set max interval and just let it be the default of 10x of base.
   // We expect 1 retry before the RPC times out with DEADLINE_EXCEEDED.
   CheckRpcSendFailure(
       DEBUG_LOCATION, StatusCode::DEADLINE_EXCEEDED, "Deadline Exceeded",
@@ -1921,16 +1928,17 @@ TEST_P(LdsRdsTest, XdsRetryPolicyMaxBackOff) {
       "5xx,cancelled,deadline-exceeded,internal,resource-exhausted,"
       "unavailable");
   retry_policy->mutable_num_retries()->set_value(kNumRetries);
-  // Set backoff to 1 second.
+  // Set base interval to 1 second and max backoff to 3 seconds, so the
+  // delays after the first three attempts will be approximately (1, 2, 3).
+  // Jitter is 0.2, so the minimum start time of the 4th attempt will be
+  // 0.8 + 1.6 + 2.4 = 4.8 seconds, and the maximum start time of the 3rd
+  // attempt will be 1.2 + 2.4 = 3.6 seconds.  With an RPC timeout of
+  // 4.2 seconds, we are guaranteed to have exactly 3 attempts.
   SetProtoDuration(
       grpc_core::Duration::Seconds(1),
       retry_policy->mutable_retry_back_off()->mutable_base_interval());
-  // Set max interval to be the same as base, so 2 retries will take 2 seconds
-  // and both retries will take place before the 2.5 seconds rpc timeout.
-  // Tested to ensure if max is not set, this test will be the same as
-  // XdsRetryPolicyLongBackOff and we will only see 1 retry in that case.
   SetProtoDuration(
-      grpc_core::Duration::Seconds(1),
+      grpc_core::Duration::Seconds(3),
       retry_policy->mutable_retry_back_off()->mutable_max_interval());
   SetRouteConfiguration(balancer_.get(), new_route_config);
   // Send an initial RPC to make sure we get connected (we don't want
@@ -1940,7 +1948,7 @@
   // We expect 2 retries before the RPC times out with DEADLINE_EXCEEDED.
   CheckRpcSendFailure(
       DEBUG_LOCATION, StatusCode::DEADLINE_EXCEEDED, "Deadline Exceeded",
-      RpcOptions().set_timeout_ms(2500).set_server_expected_error(
+      RpcOptions().set_timeout_ms(4200).set_server_expected_error(
           StatusCode::CANCELLED));
   EXPECT_EQ(2 + 1, backends_[0]->backend_service()->request_count());
 }
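The attempt-count guarantees described in the comments above can be checked mechanically. Here is a standalone sketch, not part of the commit, that accumulates the earliest and latest possible start time of each attempt, assuming the multiplier of 2 and the 0.2 jitter named in the comments:

#include <algorithm>
#include <cstdio>

// Counts how many attempts are guaranteed / possible before the RPC deadline
// under exponential backoff (multiplier 2) with +/-20% jitter.
void CheckAttempts(const char* name, double base_s, double max_s,
                   double deadline_s) {
  double delay = base_s;
  double earliest = 0.0, latest = 0.0;  // start-time window for each retry
  int guaranteed = 1, possible = 1;     // the original attempt always runs
  for (int attempt = 2; attempt <= 5; ++attempt) {
    earliest += delay * 0.8;
    latest += delay * 1.2;
    if (latest < deadline_s) ++guaranteed;  // always starts before deadline
    if (earliest < deadline_s) ++possible;  // might start before deadline
    delay = std::min(delay * 2, max_s);
  }
  std::printf("%s: at least %d attempts, at most %d\n", name, guaranteed,
              possible);
}

int main() {
  // LongBackOff: base 1.5 s, max defaults to 10x base, 2.5 s RPC timeout.
  CheckAttempts("XdsRetryPolicyLongBackOff", 1.5, 15.0, 2.5);
  // MaxBackOff: base 1 s, max 3 s, 4.2 s RPC timeout.
  CheckAttempts("XdsRetryPolicyMaxBackOff", 1.0, 3.0, 4.2);
}

For both tests the lower and upper bounds coincide (2 and 3 attempts respectively), so the request_count() assertions hold regardless of where the jitter lands.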