Skip to content

Commit

Permalink
[Serve] Add serve_deployment_replica_healthy gauge to check whether…
Browse files Browse the repository at this point in the history
… deployment replicas are healthy (ray-project#29154)

Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
  • Loading branch information
shrekris-anyscale authored and WeichenXu123 committed Dec 19, 2022
1 parent 5a51244 commit 61a61e6
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 2 deletions.
2 changes: 2 additions & 0 deletions doc/source/serve/production-guide/monitoring.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,8 @@ The following metrics are exposed by Ray Serve:
- The number of exceptions that have occurred in the deployment.
* - ``serve_deployment_replica_starts`` [**]
- The number of times this replica has been restarted due to failure.
* - ``serve_deployment_replica_healthy``
- Whether this deployment replica is healthy. 1 means healthy, 0 means unhealthy.
* - ``serve_deployment_processing_latency_ms`` [**]
- The latency for queries to be processed.
* - ``serve_replica_processing_queries`` [**]
Expand Down
19 changes: 17 additions & 2 deletions python/ray/serve/_private/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@
)
from ray.serve._private.version import DeploymentVersion, VersionedReplica


from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
from ray.util import metrics
from ray._private.gcs_utils import GcsClient
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

logger = logging.getLogger(SERVE_LOGGER_NAME)

Expand Down Expand Up @@ -983,6 +983,15 @@ def __init__(
self._name, DeploymentStatus.UPDATING
)

self.health_check_gauge = metrics.Gauge(
"serve_deployment_replica_healthy",
description=(
"Tracks whether this deployment replica is healthy. 1 means "
"healthy, 0 means unhealthy."
),
tag_keys=("deployment", "replica"),
)

def should_autoscale(self) -> bool:
"""
Check if the deployment is under autoscaling
Expand Down Expand Up @@ -1489,12 +1498,18 @@ def _check_and_update_replicas(self) -> bool:
for replica in self._replicas.pop(states=[ReplicaState.RUNNING]):
if replica.check_health():
self._replicas.add(ReplicaState.RUNNING, replica)
self.health_check_gauge.set(
1, tags={"deployment": self._name, "replica": replica.replica_tag}
)
else:
running_replicas_changed = True
logger.warning(
f"Replica {replica.replica_tag} of deployment "
f"{self._name} failed health check, stopping it."
)
self.health_check_gauge.set(
0, tags={"deployment": self._name, "replica": replica.replica_tag}
)
replica.stop(graceful=False)
self._replicas.add(ReplicaState.STOPPING, replica)
# If this is a replica of the target version, the deployment
Expand Down
1 change: 1 addition & 0 deletions python/ray/serve/tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def verify_metrics(do_assert=False):
"serve_deployment_processing_latency_ms",
# gauge
"serve_replica_processing_queries",
"serve_deployment_replica_healthy",
# handle
"serve_handle_request_counter",
]
Expand Down

0 comments on commit 61a61e6

Please sign in to comment.