Skip to content

Commit

Permalink
[serve] Don't change deployment status when autoscaling (ray-project#…
Browse files Browse the repository at this point in the history
…36520)

The deployment state UPDATING should only be used during redeployment. Right now the state is updating during autoscaling, which can be confusing for users. This PR makes it so that the state doesn't change during autoscaling. This usually means that a deployment's status will remain HEALTHY while it's autoscaling.

Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
  • Loading branch information
zcin authored and arvind-chandra committed Aug 31, 2023
1 parent 40a8648 commit 98eda4f
Show file tree
Hide file tree
Showing 4 changed files with 248 additions and 196 deletions.
10 changes: 9 additions & 1 deletion python/ray/_private/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,14 +509,20 @@ def kill_actor_and_wait_for_failure(actor, timeout=10, retry_interval_ms=100):


def wait_for_condition(
condition_predictor, timeout=10, retry_interval_ms=100, **kwargs: Any
condition_predictor,
timeout=10,
retry_interval_ms=100,
raise_exceptions=False,
**kwargs: Any,
):
"""Wait until a condition is met or time out with an exception.
Args:
condition_predictor: A function that predicts the condition.
timeout: Maximum timeout in seconds.
retry_interval_ms: Retry interval in milliseconds.
raise_exceptions: If true, exceptions that occur while executing
condition_predictor won't be caught and instead will be raised.
Raises:
RuntimeError: If the condition is not met before the timeout expires.
Expand All @@ -528,6 +534,8 @@ def wait_for_condition(
if condition_predictor(**kwargs):
return
except Exception:
if raise_exceptions:
raise
last_ex = ray._private.utils.format_error_message(traceback.format_exc())
time.sleep(retry_interval_ms / 1000.0)
message = "The condition wasn't met before the timeout expired."
Expand Down
26 changes: 19 additions & 7 deletions python/ray/serve/_private/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,24 @@ def _set_target_state(self, target_info: DeploymentInfo) -> None:

logger.info(f"Deploying new version of deployment {self._name}.")

def _set_target_state_autoscaling(self, num_replicas: int) -> None:
"""Update the target number of replicas based on an autoscaling decision.
This differs from _set_target_state because we are updating the
target number of replicas base on an autoscaling decision and
not a redeployment. This only changes the target num_replicas,
and doesn't change the current deployment status.
"""

new_info = copy(self._target_state.info)
new_info.set_autoscaled_num_replicas(num_replicas)
new_info.version = self._target_state.version.code_version

target_state = DeploymentTargetState.from_deployment_info(new_info)

self._save_checkpoint_func(writeahead_checkpoints={self._name: target_state})
self._target_state = target_state

def deploy(self, deployment_info: DeploymentInfo) -> bool:
"""Deploy the deployment.
Expand Down Expand Up @@ -1345,7 +1363,6 @@ def autoscale(
if self._target_state.deleting:
return

curr_info = self._target_state.info
autoscaling_policy = self._target_state.info.autoscaling_policy
decision_num_replicas = autoscaling_policy.get_decision_num_replicas(
curr_target_num_replicas=self._target_state.num_replicas,
Expand All @@ -1362,12 +1379,7 @@ def autoscale(
f"current handle queued queries: {current_handle_queued_queries}."
)

new_config = copy(curr_info)
new_config.set_autoscaled_num_replicas(decision_num_replicas)
if new_config.version is None:
new_config.version = self._target_state.version.code_version

self._set_target_state(new_config)
self._set_target_state_autoscaling(decision_num_replicas)

def delete(self) -> None:
if not self._target_state.deleting:
Expand Down
Loading

0 comments on commit 98eda4f

Please sign in to comment.