From df70e3258c0ff18b5cf33fc4cfefc8aee6039a10 Mon Sep 17 00:00:00 2001
From: Raven Black
Date: Mon, 6 May 2024 14:25:01 +0000
Subject: [PATCH 1/2] Increase flake threshold for hotrestart_handoff_test

Signed-off-by: Raven Black
---
 test/integration/python/hotrestart_handoff_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/integration/python/hotrestart_handoff_test.py b/test/integration/python/hotrestart_handoff_test.py
index 9627b24b2696..4b7e3b6298e3 100644
--- a/test/integration/python/hotrestart_handoff_test.py
+++ b/test/integration/python/hotrestart_handoff_test.py
@@ -33,7 +33,7 @@ def random_loopback_host():
 # This is a timeout that must be long enough that the hot restarted
 # instance can reliably be fully started up within this many seconds, or the
 # test will be flaky. 3 seconds is enough on a not-busy host with a non-tsan
-# non-coverage build; 10 seconds should be enough to be not flaky in most
+# non-coverage build; 15 seconds should be enough to be not flaky in most
 # configurations.
 #
 # Unfortunately, because the test is verifying the behavior of a connection
@@ -44,7 +44,7 @@ def random_loopback_host():
 # Ideally this would be adjusted (3x) for tsan and coverage runs, but making that
 # possible for python is outside the scope of this test, so we're stuck using the
 # 3x value for all tests.
-STARTUP_TOLERANCE_SECONDS = 10
+STARTUP_TOLERANCE_SECONDS = 15
 
 # We send multiple requests in parallel and require them all to function correctly
 # - this makes it so if something is flaky we're more likely to encounter it, and

From 7668d1b85447c22de7b0efa79565836ffca48138 Mon Sep 17 00:00:00 2001
From: Raven Black
Date: Thu, 9 May 2024 17:43:36 +0000
Subject: [PATCH 2/2] Actually *lower* the value instead.

Signed-off-by: Raven Black
---
 test/integration/python/BUILD                      |  4 +++-
 test/integration/python/hotrestart_handoff_test.py | 13 ++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/test/integration/python/BUILD b/test/integration/python/BUILD
index 8d824bbdad99..5f1e42517986 100644
--- a/test/integration/python/BUILD
+++ b/test/integration/python/BUILD
@@ -7,7 +7,9 @@ envoy_package()
 
 envoy_py_test(
     name = "hotrestart_handoff_test",
-    size = "medium",
+    # This is not actually large, but setting large makes the test skip for
+    # asan and tsan.
+    size = "large",
     srcs = select({
         "//bazel:disable_hot_restart_or_admin": ["null_test.py"],
         "//conditions:default": ["hotrestart_handoff_test.py"],
diff --git a/test/integration/python/hotrestart_handoff_test.py b/test/integration/python/hotrestart_handoff_test.py
index 4b7e3b6298e3..913ed959940a 100644
--- a/test/integration/python/hotrestart_handoff_test.py
+++ b/test/integration/python/hotrestart_handoff_test.py
@@ -33,7 +33,7 @@ def random_loopback_host():
 # This is a timeout that must be long enough that the hot restarted
 # instance can reliably be fully started up within this many seconds, or the
 # test will be flaky. 3 seconds is enough on a not-busy host with a non-tsan
-# non-coverage build; 15 seconds should be enough to be not flaky in most
+# non-coverage build; 6 seconds should be enough to be not flaky in most
 # configurations.
 #
 # Unfortunately, because the test is verifying the behavior of a connection
@@ -41,10 +41,13 @@ def random_loopback_host():
 # so increasing this value increases the duration of the test. For this
 # reason we want to keep it as low as possible without causing flaky failure.
 #
-# Ideally this would be adjusted (3x) for tsan and coverage runs, but making that
-# possible for python is outside the scope of this test, so we're stuck using the
-# 3x value for all tests.
-STARTUP_TOLERANCE_SECONDS = 15
+# If this goes longer than 10 seconds connections start timing out which
+# causes the test to get stuck and time out. Unfortunately for tsan or asan
+# runs the "long enough to start up" constraint fights with the "too long for
+# connections to be idle" constraint. Ideally this test would be disabled for
+# those slower test contexts, but we don't currently have infrastructure for
+# that.
+STARTUP_TOLERANCE_SECONDS = 6
 
 # We send multiple requests in parallel and require them all to function correctly
 # - this makes it so if something is flaky we're more likely to encounter it, and
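
Note on the constraint described in the new comment: the drain-spanning request has to stay open for the full startup tolerance, yet finish before the roughly 10-second point at which connections start timing out. The Python sketch below only illustrates that relationship; the constant names and the use of asyncio are assumptions for illustration, not the test's actual implementation.

import asyncio

# The ~10 second figure is taken from the comment above; 6 is the value the
# second patch settles on. Everything else here is illustrative only.
CONNECTION_IDLE_LIMIT_SECONDS = 10
STARTUP_TOLERANCE_SECONDS = 6


async def drain_spanning_request() -> str:
    # Stand-in for the long-lived request: it is held open for the full
    # startup tolerance while the hot restart completes.
    await asyncio.sleep(STARTUP_TOLERANCE_SECONDS)
    return "ok"


async def main() -> None:
    # The request only succeeds if the tolerance fits under the idle limit;
    # pushing the tolerance past ~10 seconds would make this (and the real
    # test) hang until the outer timeout fires.
    reply = await asyncio.wait_for(
        drain_spanning_request(), timeout=CONNECTION_IDLE_LIMIT_SECONDS)
    print(reply)


if __name__ == "__main__":
    asyncio.run(main())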
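
For readers outside the test, the rough shape of what STARTUP_TOLERANCE_SECONDS budgets for is sketched below. This is not the test's actual code; the helper name, the health-check URL argument, and the polling interval are all hypothetical, and stand in for "wait until the hot-restarted instance is serving, or give up after the tolerance".

import time
import urllib.request

STARTUP_TOLERANCE_SECONDS = 6  # value set by the second patch


def wait_for_hot_restarted_instance(
        url: str, timeout: float = STARTUP_TOLERANCE_SECONDS) -> bool:
    """Polls `url` until it responds or `timeout` seconds elapse.

    Returns True if the new instance came up within the tolerance.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=1) as response:
                if response.status == 200:
                    return True
        except OSError:
            pass  # Not accepting requests yet; keep polling until the deadline.
        time.sleep(0.25)
    return False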