From 0e2a077515dc94a81eb22897c175de4d9c028a4d Mon Sep 17 00:00:00 2001 From: Gene Su Date: Thu, 15 Jun 2023 09:16:57 -0700 Subject: [PATCH 1/4] [serve] add the missing early return when there are still in progress health check before trying to recreating a new one issue: https://github.com/ray-project/ray/issues/36458 Signed-off-by: Gene Su --- python/ray/serve/_private/http_state.py | 1 + python/ray/serve/tests/test_http_state.py | 49 ++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/python/ray/serve/_private/http_state.py b/python/ray/serve/_private/http_state.py index 6f4b6640581f..e3d3636c1384 100644 --- a/python/ray/serve/_private/http_state.py +++ b/python/ray/serve/_private/http_state.py @@ -191,6 +191,7 @@ def update(self): f"{self._node_id} after {DEFAULT_HEALTH_CHECK_TIMEOUT_S}s" ) self.try_update_status(HTTPProxyStatus.UNHEALTHY) + return # If there's no active in-progress health check and it has been more than 10 # seconds since the last health check, perform another health check. diff --git a/python/ray/serve/tests/test_http_state.py b/python/ray/serve/tests/test_http_state.py index d96fb787057e..f3f1f993ac79 100644 --- a/python/ray/serve/tests/test_http_state.py +++ b/python/ray/serve/tests/test_http_state.py @@ -418,9 +418,54 @@ async def check_health(self): @patch("ray.serve._private.http_state.DEFAULT_HEALTH_CHECK_TIMEOUT_S", 0.1) @patch("ray.serve._private.http_state.PROXY_HEALTH_CHECK_PERIOD_S", 0.1) -def test_http_proxy_state_update_healthy_check_health_always_timeout(): +def test_http_proxy_state_check_health_always_timeout_timtout_eq_period(): """Test calling update method on HTTPProxyState when the proxy state is HEALTHY and - when the ready call always timed out. + when the ready call always timed out and health check timeout and period equals. + + The proxy state started with STARTING. After update is called and ready call + succeeded, the status will change to HEALTHY. After the next few check_health calls + takes very long time to finished, the status will eventually change to UNHEALTHY + after all retries have exhausted. + """ + + @ray.remote(num_cpus=0) + class NewMockHTTPProxyActor: + async def ready(self): + return json.dumps(["mock_worker_id", "mock_log_file_path"]) + + async def check_health(self): + await asyncio.sleep(100) + + proxy_state = _create_http_proxy_state(proxy_actor_class=NewMockHTTPProxyActor) + + # Continuously trigger update. The status should change from STARTING to HEALTHY + # when ready. + wait_for_condition( + condition_predictor=_update_and_check_proxy_status, + state=proxy_state, + status=HTTPProxyStatus.HEALTHY, + ) + first_check_time = proxy_state._last_health_check_time + + # Continuously trigger update and status should change to UNHEALTHY. + wait_for_condition( + condition_predictor=_update_and_check_proxy_status, + state=proxy_state, + status=HTTPProxyStatus.UNHEALTHY, + ) + + # Ensure the check time have changed since the last update + assert first_check_time != proxy_state._last_health_check_time + + # Ensure _consecutive_health_check_failures is correct + assert proxy_state._consecutive_health_check_failures == 3 + + +@patch("ray.serve._private.http_state.DEFAULT_HEALTH_CHECK_TIMEOUT_S", 1) +@patch("ray.serve._private.http_state.PROXY_HEALTH_CHECK_PERIOD_S", 0.1) +def test_http_proxy_state_check_health_always_timeout_timtout_greater_than_period(): + """Test calling update method on HTTPProxyState when the proxy state is HEALTHY and + when the ready call always timed out and health check timeout greater than period. The proxy state started with STARTING. After update is called and ready call succeeded, the status will change to HEALTHY. After the next few check_health calls From 22a6261f7511dbf6bfc34d4fd03f931822772d87 Mon Sep 17 00:00:00 2001 From: Gene Su Date: Thu, 15 Jun 2023 10:54:25 -0700 Subject: [PATCH 2/4] using explicit else block for return and add comment for the reason for the return Signed-off-by: Gene Su --- python/ray/serve/_private/http_state.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/ray/serve/_private/http_state.py b/python/ray/serve/_private/http_state.py index e3d3636c1384..3ce02692c2b0 100644 --- a/python/ray/serve/_private/http_state.py +++ b/python/ray/serve/_private/http_state.py @@ -191,7 +191,14 @@ def update(self): f"{self._node_id} after {DEFAULT_HEALTH_CHECK_TIMEOUT_S}s" ) self.try_update_status(HTTPProxyStatus.UNHEALTHY) - return + else: + # This return is important to not trigger a new health check when + # there is an in progress health check. When the health check object + # is still in progress and before the timeout is triggered, we will + # do an early return here to signal the completion of this update call + # and to prevent another health check object from recreated in the + # code below. + return # If there's no active in-progress health check and it has been more than 10 # seconds since the last health check, perform another health check. From ec33206d679b9410fcbd421e4e748bd40bfaa414 Mon Sep 17 00:00:00 2001 From: Gene Der Su Date: Thu, 15 Jun 2023 12:02:38 -0700 Subject: [PATCH 3/4] Update python/ray/serve/tests/test_http_state.py Co-authored-by: Edward Oakes Signed-off-by: Gene Der Su --- python/ray/serve/tests/test_http_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_http_state.py b/python/ray/serve/tests/test_http_state.py index f3f1f993ac79..ee0f7eedfa3b 100644 --- a/python/ray/serve/tests/test_http_state.py +++ b/python/ray/serve/tests/test_http_state.py @@ -463,7 +463,7 @@ async def check_health(self): @patch("ray.serve._private.http_state.DEFAULT_HEALTH_CHECK_TIMEOUT_S", 1) @patch("ray.serve._private.http_state.PROXY_HEALTH_CHECK_PERIOD_S", 0.1) -def test_http_proxy_state_check_health_always_timeout_timtout_greater_than_period(): +def test_http_proxy_state_check_health_always_timeout_timeout_greater_than_period(): """Test calling update method on HTTPProxyState when the proxy state is HEALTHY and when the ready call always timed out and health check timeout greater than period. From 84c2d9a315f9b555c65d6c6be32bff3610c19bd1 Mon Sep 17 00:00:00 2001 From: Gene Der Su Date: Thu, 15 Jun 2023 12:02:53 -0700 Subject: [PATCH 4/4] Update python/ray/serve/tests/test_http_state.py Co-authored-by: Edward Oakes Signed-off-by: Gene Der Su --- python/ray/serve/tests/test_http_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/serve/tests/test_http_state.py b/python/ray/serve/tests/test_http_state.py index ee0f7eedfa3b..1c51e26b2fdc 100644 --- a/python/ray/serve/tests/test_http_state.py +++ b/python/ray/serve/tests/test_http_state.py @@ -418,7 +418,7 @@ async def check_health(self): @patch("ray.serve._private.http_state.DEFAULT_HEALTH_CHECK_TIMEOUT_S", 0.1) @patch("ray.serve._private.http_state.PROXY_HEALTH_CHECK_PERIOD_S", 0.1) -def test_http_proxy_state_check_health_always_timeout_timtout_eq_period(): +def test_http_proxy_state_check_health_always_timeout_timeout_eq_period(): """Test calling update method on HTTPProxyState when the proxy state is HEALTHY and when the ready call always timed out and health check timeout and period equals.