diff --git a/doc/source/serve/production-guide/monitoring.md b/doc/source/serve/production-guide/monitoring.md
index 48c8aa4433c4..410f14ad9c9d 100644
--- a/doc/source/serve/production-guide/monitoring.md
+++ b/doc/source/serve/production-guide/monitoring.md
@@ -232,6 +232,8 @@ The following metrics are exposed by Ray Serve:
      - The number of exceptions that have occurred in the deployment.
    * - ``serve_deployment_replica_starts`` [**]
      - The number of times this replica has been restarted due to failure.
+   * - ``serve_deployment_replica_healthy``
+     - Whether this deployment replica is healthy. 1 means healthy, 0 unhealthy.
    * - ``serve_deployment_processing_latency_ms`` [**]
      - The latency for queries to be processed.
    * - ``serve_replica_processing_queries`` [**]
diff --git a/python/ray/serve/_private/deployment_state.py b/python/ray/serve/_private/deployment_state.py
index df91af5f3170..b432d2eb2992 100644
--- a/python/ray/serve/_private/deployment_state.py
+++ b/python/ray/serve/_private/deployment_state.py
@@ -51,9 +51,9 @@
 )
 from ray.serve._private.version import DeploymentVersion, VersionedReplica
 
-
-from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+from ray.util import metrics
 from ray._private.gcs_utils import GcsClient
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
 
 logger = logging.getLogger(SERVE_LOGGER_NAME)
 
@@ -983,6 +983,15 @@ def __init__(
             self._name, DeploymentStatus.UPDATING
         )
 
+        self.health_check_gauge = metrics.Gauge(
+            "serve_deployment_replica_healthy",
+            description=(
+                "Tracks whether this deployment replica is healthy. 1 means "
+                "healthy, 0 means unhealthy."
+            ),
+            tag_keys=("deployment", "replica"),
+        )
+
     def should_autoscale(self) -> bool:
         """
         Check if the deployment is under autoscaling
@@ -1489,12 +1498,18 @@ def _check_and_update_replicas(self) -> bool:
         for replica in self._replicas.pop(states=[ReplicaState.RUNNING]):
             if replica.check_health():
                 self._replicas.add(ReplicaState.RUNNING, replica)
+                self.health_check_gauge.set(
+                    1, tags={"deployment": self._name, "replica": replica.replica_tag}
+                )
             else:
                 running_replicas_changed = True
                 logger.warning(
                     f"Replica {replica.replica_tag} of deployment "
                     f"{self._name} failed health check, stopping it."
                 )
+                self.health_check_gauge.set(
+                    0, tags={"deployment": self._name, "replica": replica.replica_tag}
+                )
                 replica.stop(graceful=False)
                 self._replicas.add(ReplicaState.STOPPING, replica)
                 # If this is a replica of the target version, the deployment
diff --git a/python/ray/serve/tests/test_metrics.py b/python/ray/serve/tests/test_metrics.py
index c72e840198bb..680d6ba134b8 100644
--- a/python/ray/serve/tests/test_metrics.py
+++ b/python/ray/serve/tests/test_metrics.py
@@ -46,6 +46,7 @@ def verify_metrics(do_assert=False):
         "serve_deployment_processing_latency_ms",
         # gauge
         "serve_replica_processing_queries",
+        "serve_deployment_replica_healthy",
         # handle
         "serve_handle_request_counter",
     ]
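
For context (not part of the patch itself), the new metric follows the standard `ray.util.metrics.Gauge` pattern: construct the gauge once with `tag_keys`, then call `set()` with per-replica tags on every health check. The sketch below is a minimal, hypothetical example of that pattern; the deployment and replica names are made up, and it assumes a local `ray.init()` with the metrics agent running so the values are exported to Prometheus.

```python
# Minimal sketch (illustration only, not part of the patch) of the
# ray.util.metrics.Gauge usage that this change adds to deployment_state.py.
import ray
from ray.util import metrics

ray.init()

# Define the gauge once, keyed by deployment and replica tags,
# mirroring the definition added in DeploymentState.__init__.
health_gauge = metrics.Gauge(
    "serve_deployment_replica_healthy",
    description="1 means the replica is healthy, 0 means unhealthy.",
    tag_keys=("deployment", "replica"),
)

# Record one healthy and one unhealthy replica. The hypothetical
# deployment/replica names stand in for self._name and replica.replica_tag.
health_gauge.set(1, tags={"deployment": "my_deployment", "replica": "replica-1"})
health_gauge.set(0, tags={"deployment": "my_deployment", "replica": "replica-2"})
```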