[backoff] fix tests to account for jitter (grpc#37629)
Fix some test flakiness caused by the jitter fix in the backoff library in grpc#37595.

In the xDS retry tests, the additional jitter allowed an extra attempt to sneak in before the call deadline, so I adjusted the knobs to ensure that exactly the expected number of attempts fits within each test's timeout.

In the PF test, I rewrote the test to use a connection injector, so that it can measure the time between connection attempts more accurately, without skew from the server startup time.
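For reference, the timing window involved follows directly from the backoff parameters. Here is a minimal standalone sketch of the arithmetic, not part of the commit; it assumes the 0.2 jitter from grpc#37595 and the 100 ms initial backoff that the PF test sets below:

#include <cstdio>

int main() {
  // The PF test sets GRPC_ARG_INITIAL_RECONNECT_BACKOFF_MS to 100 ms, and
  // the backoff library now applies +/-20% jitter, so the second connection
  // attempt starts 80-120 ms after the first.  The test asserts a slightly
  // wider window (0.7x-1.3x) to absorb thread hops and measurement skew.
  const double kInitialBackoffMs = 100.0;
  const double kJitter = 0.2;
  std::printf("second attempt starts in [%.0f ms, %.0f ms]\n",
              kInitialBackoffMs * (1 - kJitter),
              kInitialBackoffMs * (1 + kJitter));
  std::printf("test accepts [%.0f ms, %.0f ms]\n", kInitialBackoffMs * 0.7,
              kInitialBackoffMs * 1.3);
}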

Closes grpc#37629

COPYBARA_INTEGRATE_REVIEW=grpc#37629 from markdroth:xds_retry_backoff_flake_fix f35ac90
PiperOrigin-RevId: 671120341
markdroth authored and copybara-github committed Sep 4, 2024
1 parent 4f8087e commit c83f91e
Showing 2 changed files with 47 additions and 29 deletions.
48 changes: 29 additions & 19 deletions test/cpp/end2end/client_lb_end2end_test.cc
@@ -954,39 +954,49 @@ TEST_F(PickFirstTest, SelectsReadyAtStartup) {
 }
 
 TEST_F(PickFirstTest, BackOffInitialReconnect) {
+  StartServers(1);
   ChannelArguments args;
   constexpr int kInitialBackOffMs = 100;
   args.SetInt(GRPC_ARG_INITIAL_RECONNECT_BACKOFF_MS,
               kInitialBackOffMs * grpc_test_slowdown_factor());
-  const std::vector<int> ports = {grpc_pick_unused_port_or_die()};
   FakeResolverResponseGeneratorWrapper response_generator;
   auto channel = BuildChannel("pick_first", response_generator, args);
   auto stub = BuildStub(channel);
-  response_generator.SetNextResolution(ports);
-  // Start trying to connect.  The channel will report
-  // TRANSIENT_FAILURE, because the server is not reachable.
-  const grpc_core::Timestamp t0 = grpc_core::Timestamp::Now();
-  ASSERT_TRUE(WaitForChannelState(
-      channel.get(),
-      [&](grpc_connectivity_state state) {
+  response_generator.SetNextResolution({servers_[0]->port_});
+  // Intercept the first two connection attempts.
+  ConnectionAttemptInjector injector;
+  auto hold1 = injector.AddHold(servers_[0]->port_);
+  auto hold2 = injector.AddHold(servers_[0]->port_);
+  // Start trying to connect.
+  EXPECT_EQ(channel->GetState(/*try_to_connect=*/true), GRPC_CHANNEL_IDLE);
+  // When the first connection attempt starts, record the time, then fail the
+  // attempt.
+  hold1->Wait();
+  const grpc_core::Timestamp first_attempt_time = grpc_core::Timestamp::Now();
+  hold1->Fail(absl::UnavailableError("nope"));
+  // Wait for the second attempt and see how long it took.
+  hold2->Wait();
+  const grpc_core::Duration waited =
+      grpc_core::Timestamp::Now() - first_attempt_time;
+  // The channel will transition to TRANSIENT_FAILURE.
+  EXPECT_TRUE(
+      WaitForChannelState(channel.get(), [&](grpc_connectivity_state state) {
        if (state == GRPC_CHANNEL_TRANSIENT_FAILURE) return true;
        EXPECT_THAT(state, ::testing::AnyOf(GRPC_CHANNEL_IDLE,
                                            GRPC_CHANNEL_CONNECTING));
        return false;
-      },
-      /*try_to_connect=*/true));
-  // Bring up a server on the chosen port.
-  StartServers(1, ports);
-  // Now the channel will become connected.
-  ASSERT_TRUE(WaitForChannelReady(channel.get()));
+      }));
+  // Now let the second attempt complete.
+  hold2->Resume();
+  // The channel will transition to READY.
+  EXPECT_TRUE(WaitForChannelReady(channel.get()));
   // Check how long it took.
-  const grpc_core::Duration waited = grpc_core::Timestamp::Now() - t0;
   VLOG(2) << "Waited " << waited.millis() << " milliseconds";
   // We should have waited at least kInitialBackOffMs, plus or minus
-  // jitter.  We give a little more leeway on the high end to account
-  // for things taking a little longer than expected in other threads.
+  // jitter.  Jitter is 0.2, but we give extra leeway to account for
+  // measurement skew, thread hops, etc.
   EXPECT_GE(waited.millis(),
-            (kInitialBackOffMs * grpc_test_slowdown_factor()) * 0.8);
+            (kInitialBackOffMs * grpc_test_slowdown_factor()) * 0.7);
   EXPECT_LE(waited.millis(),
             (kInitialBackOffMs * grpc_test_slowdown_factor()) * 1.3);
 }
@@ -2886,7 +2896,7 @@ class ClientLbAddressTest : public ClientLbEnd2endTest {
     grpc_core::CoreConfiguration::Reset();
   }
 
-  const std::vector<std::string>& addresses_seen() {
+  std::vector<std::string> addresses_seen() {
     grpc_core::MutexLock lock(&mu_);
     return addresses_seen_;
  }
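The second hunk in this file is a smaller hardening: returning a const reference to a mutex-guarded member lets the caller keep reading it after the lock is released, whereas returning by value copies while the lock is held. A minimal illustration of the distinction, which is my reading of the change rather than anything stated in the commit message, using std::mutex and a hypothetical AddressLog class instead of the test's grpc_core::Mutex:

#include <mutex>
#include <string>
#include <vector>

class AddressLog {
 public:
  // Returning by value: the copy is made while mu_ is held, so the caller
  // never touches addresses_seen_ outside the critical section.  A
  // `const std::vector<std::string>&` return would hand out a reference
  // that outlives the lock and can race with writers.
  std::vector<std::string> addresses_seen() {
    std::lock_guard<std::mutex> lock(mu_);
    return addresses_seen_;
  }

 private:
  std::mutex mu_;
  std::vector<std::string> addresses_seen_;  // guarded by mu_
};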
28 changes: 18 additions & 10 deletions test/cpp/end2end/xds/xds_routing_end2end_test.cc
@@ -1889,12 +1889,19 @@ TEST_P(LdsRdsTest, XdsRetryPolicyLongBackOff) {
       "5xx,cancelled,deadline-exceeded,internal,resource-exhausted,"
       "unavailable");
   retry_policy->mutable_num_retries()->set_value(kNumRetries);
-  // Set backoff to 1 second, 1/2 of rpc timeout of 2 second.
+  // Set base interval to 1.5 seconds.  Multiplier is 2, so the delay after
+  // the second attempt will be ~3 seconds.  However, there is a jitter of
+  // 0.2, so the delay after the first attempt will be in the range
+  // [1.2, 1.8], and the delay after the second attempt will be in the range
+  // [2.4, 3.6].  So the first retry will always start before the 2.5 second
+  // timeout, and the second retry cannot start earlier than
+  // 1.2 + 2.4 = 3.6 seconds, which is after the 2.5 second timeout.  This
+  // ensures that there will be exactly one retry.
+  // No need to set max interval; it defaults to 10x base interval.
   SetProtoDuration(
-      grpc_core::Duration::Seconds(1),
+      grpc_core::Duration::Milliseconds(1500),
       retry_policy->mutable_retry_back_off()->mutable_base_interval());
   SetRouteConfiguration(balancer_.get(), new_route_config);
-  // No need to set max interval and just let it be the default of 10x of base.
   // We expect 1 retry before the RPC times out with DEADLINE_EXCEEDED.
   CheckRpcSendFailure(
       DEBUG_LOCATION, StatusCode::DEADLINE_EXCEEDED, "Deadline Exceeded",
@@ -1921,16 +1928,17 @@ TEST_P(LdsRdsTest, XdsRetryPolicyMaxBackOff) {
       "5xx,cancelled,deadline-exceeded,internal,resource-exhausted,"
       "unavailable");
   retry_policy->mutable_num_retries()->set_value(kNumRetries);
-  // Set backoff to 1 second.
+  // Set base interval to 1 second and max backoff to 3 seconds, so the
+  // delays after the first three attempts will be approximately (1, 2, 3).
+  // Jitter is 0.2, so the minimum start time of the 4th attempt will be
+  // 0.8 + 1.6 + 2.4 = 4.8 seconds, and the maximum start time of the 3rd
+  // attempt will be 1.2 + 2.4 = 3.6 seconds.  With an RPC timeout of
+  // 4.2 seconds, we are guaranteed to have exactly 3 attempts.
   SetProtoDuration(
       grpc_core::Duration::Seconds(1),
       retry_policy->mutable_retry_back_off()->mutable_base_interval());
-  // Set max interval to be the same as base, so 2 retries will take 2 seconds
-  // and both retries will take place before the 2.5 seconds rpc timeout.
-  // Tested to ensure if max is not set, this test will be the same as
-  // XdsRetryPolicyLongBackOff and we will only see 1 retry in that case.
   SetProtoDuration(
-      grpc_core::Duration::Seconds(1),
+      grpc_core::Duration::Seconds(3),
       retry_policy->mutable_retry_back_off()->mutable_max_interval());
   SetRouteConfiguration(balancer_.get(), new_route_config);
   // Send an initial RPC to make sure we get connected (we don't want
@@ -1940,7 +1948,7 @@
   // We expect 2 retries before the RPC times out with DEADLINE_EXCEEDED.
   CheckRpcSendFailure(
       DEBUG_LOCATION, StatusCode::DEADLINE_EXCEEDED, "Deadline Exceeded",
-      RpcOptions().set_timeout_ms(2500).set_server_expected_error(
+      RpcOptions().set_timeout_ms(4200).set_server_expected_error(
           StatusCode::CANCELLED));
   EXPECT_EQ(2 + 1, backends_[0]->backend_service()->request_count());
 }
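The attempt-count guarantees described in the comments above can be checked mechanically. Here is a standalone sketch, not part of the commit, that accumulates the earliest and latest possible start time of each attempt, assuming the multiplier of 2 and the 0.2 jitter named in the comments:

#include <algorithm>
#include <cstdio>

// Counts how many attempts are guaranteed / possible before the RPC deadline
// under exponential backoff (multiplier 2) with +/-20% jitter.
void CheckAttempts(const char* name, double base_s, double max_s,
                   double deadline_s) {
  double delay = base_s;
  double earliest = 0.0, latest = 0.0;  // start-time window for each retry
  int guaranteed = 1, possible = 1;     // the original attempt always runs
  for (int attempt = 2; attempt <= 5; ++attempt) {
    earliest += delay * 0.8;
    latest += delay * 1.2;
    if (latest < deadline_s) ++guaranteed;  // always starts before deadline
    if (earliest < deadline_s) ++possible;  // might start before deadline
    delay = std::min(delay * 2, max_s);
  }
  std::printf("%s: at least %d attempts, at most %d\n", name, guaranteed,
              possible);
}

int main() {
  // LongBackOff: base 1.5 s, max defaults to 10x base, 2.5 s RPC timeout.
  CheckAttempts("XdsRetryPolicyLongBackOff", 1.5, 15.0, 2.5);
  // MaxBackOff: base 1 s, max 3 s, 4.2 s RPC timeout.
  CheckAttempts("XdsRetryPolicyMaxBackOff", 1.0, 3.0, 4.2);
}

For both tests the lower and upper bounds coincide (2 and 3 attempts respectively), so the request_count() assertions hold regardless of where the jitter lands.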