Skip to content

Commit

Permalink
Refine waiting pending instance
Browse files Browse the repository at this point in the history
  • Loading branch information
s5u13b committed Jan 15, 2025
1 parent fdfa104 commit 9a57b30
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
14 changes: 10 additions & 4 deletions llumnix/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,10 +614,16 @@ async def _check_deployment_states_loop(self, interval: float) -> None:
async def watch_instance_deployment_states(instance_id: str):
# There might be some delays of calling _init_server_and_instance, so sleep first.
await asyncio.sleep(WATCH_DEPLOYMENT_INTERVAL)
- instance_state = list_actors(filters=[("name", "=", get_instance_name(instance_id))])
- instance_pending_creation = len(instance_state) == 1 and instance_state[0]["state"] == "PENDING_CREATION"
- if instance_pending_creation:
- await asyncio.sleep(WATCH_DEPLOYMENT_INTERVAL_PENDING_INSTANCE)
+ wait_pending_instance_time = 0.0
+ while True:
+ instance_state = list_actors(filters=[("name", "=", get_instance_name(instance_id))])
+ instance_pending_creation = len(instance_state) == 1 and instance_state[0]["state"] == "PENDING_CREATION"
+ if not instance_pending_creation:
+ break
+ await asyncio.sleep(WATCH_DEPLOYMENT_INTERVAL)
+ wait_pending_instance_time += WATCH_DEPLOYMENT_INTERVAL
+ if wait_pending_instance_time >= WATCH_DEPLOYMENT_INTERVAL_PENDING_INSTANCE:
+ break
pg_created, server_alive, instance_alive = self._get_instance_deployment_states(instance_id)
if pg_created and (not server_alive or not instance_alive):
logger.warning("instance {} deployment states incorrect, states: (pg {}, server {}, instance {})"
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_test/global_scheduler/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ def test_check_deployment_states_loop_and_auto_scale_up_loop(ray_env, request_ou
kill_server(instance_ids[1])
kill_instance(instance_ids[2])
# Wait for check deployment states, scale down instance and auto scale up.
- time.sleep(120.0)
+ time.sleep(90.0)
num_instances = ray.get(manager.scale_up.remote([], []))
assert num_instances == 4
curr_pgs, curr_servers, curr_instances = ray.get(manager._get_cluster_deployment.remote())
Expand Down

0 comments on commit 9a57b30

Please sign in to comment.