[Serve] Add ray serve request timeout to config (ray-project#36107)

Currently the only way to configure the HTTP request timeout is by setting the RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S (or the soon-to-be-deprecated SERVE_REQUEST_PROCESSING_TIMEOUT_S) environment variable. This PR adds a new config option, request_timeout_s, to the http_options in the Serve config and uses it as the timeout for HTTP requests. Added RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S deprecation message. Added request_timeout_s to HTTPOptionsSchema for documentation; it should also be generated at https://docs.ray.io/en/master/serve/api/doc/ray.serve.schema.HTTPOptionsSchema.html#ray.serve.schema.HTTPOptionsSchema.request_timeout_s when merged. Fixed a small bug where, when running serve run with a config file, the http_options were not picked up. Also did a manual test locally by running serve run test_config.yaml and confirming that the request_timeout_s in the config is respected. --------- Signed-off-by: Gene Su <e870252314@gmail.com> Signed-off-by: Gene Der Su <gdsu@ucdavis.edu> Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com> Signed-off-by: e428265 <arvind.chandramouli@lmco.com>
lmco · Aug 31, 2023 · af086f1 · af086f1
1 parent a9a9933
commit af086f1
Show file tree

Hide file tree

Showing 10 changed files with 113 additions and 15 deletions.
diff --git a/doc/source/serve/advanced-guides/performance.md b/doc/source/serve/advanced-guides/performance.md
@@ -123,7 +123,7 @@ proper backpressure. You can increase the value in the deployment decorator; e.g
 
 By default, Serve lets client HTTP requests run to completion no matter how long they take. However, slow requests could bottleneck the replica processing, blocking other requests that are waiting. It's recommended that you set an end-to-end timeout, so slow requests can be terminated and retried at another replica.
 
-You can set an end-to-end timeout for HTTP requests by setting the `RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S` environment variable. HTTP Proxies will wait for that many seconds before terminating an HTTP request and retrying it at another replica. This environment variable should be set on every node in your Ray cluster, and it cannot be updated during runtime.
+You can set an end-to-end timeout for HTTP requests by setting the `request_timeout_s` in the `http_options` field of the Serve config. HTTP Proxies will wait for that many seconds before terminating an HTTP request and retrying it at another replica. This config is global to your Ray cluster, and it cannot be updated during runtime.
 
 (serve-performance-http-retry)=
 ### Set request retry times

diff --git a/python/ray/serve/_private/http_proxy.py b/python/ray/serve/_private/http_proxy.py
@@ -71,9 +71,11 @@
 if os.environ.get("SERVE_REQUEST_PROCESSING_TIMEOUT_S") is not None:
     logger.warning(
         "The `SERVE_REQUEST_PROCESSING_TIMEOUT_S` environment variable has "
-        "been deprecated. Please use `RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S` "
-        "instead. `SERVE_REQUEST_PROCESSING_TIMEOUT_S` will be ignored in "
-        "future versions."
+        "been deprecated. Please set `request_timeout_s` in your Serve config's "
+        "`http_options` field instead. `SERVE_REQUEST_PROCESSING_TIMEOUT_S` will be "
+        "ignored in future versions. See: https://docs.ray.io/en/releases-2.5.1/serve/a"
+        "pi/doc/ray.serve.schema.HTTPOptionsSchema.html#ray.serve.schema.HTTPOptionsSch"
+        "ema.request_timeout_s"
     )
 
 
@@ -167,7 +169,9 @@ class HTTPProxy:
     >>> uvicorn.run(HTTPProxy(controller_name)) # doctest: +SKIP
     """
 
-    def __init__(self, controller_name: str):
+    def __init__(self, controller_name: str, request_timeout_s: Optional[float] = None):
+        self.request_timeout_s = request_timeout_s
+
         # Set the controller name so that serve will connect to the
         # controller instance this proxy is running in.
         ray.serve.context._set_internal_replica_context(
@@ -477,14 +481,14 @@ async def send_request_to_replica_unary(
                 # check if latency drops significantly. See
                 # https://github.com/ray-project/ray/pull/29534 for more info.
                 _, request_timed_out = await asyncio.wait(
-                    [object_ref], timeout=RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S
+                    [object_ref], timeout=self.request_timeout_s
                 )
                 if request_timed_out:
                     logger.info(
-                        "Request didn't finish within "
-                        f"{RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S} seconds. Retrying "
-                        "with another replica. You can modify this timeout by "
-                        'setting the "RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S" env var.'
+                        f"Request didn't finish within {self.request_timeout_s} seconds"
+                        ". Retrying with another replica. You can modify this timeout "
+                        'by setting "request_timeout_s" in your Serve config\'s '
+                        "`http_options` field."
                     )
                     backoff = True
                 else:
@@ -614,6 +618,7 @@ def __init__(
         root_path: str,
         controller_name: str,
         node_ip_address: str,
+        request_timeout_s: Optional[float] = None,
         http_middlewares: Optional[List["starlette.middleware.Middleware"]] = None,
     ):  # noqa: F821
         configure_component_logger(
@@ -642,7 +647,12 @@ def __init__(
 
         self.setup_complete = asyncio.Event()
 
-        self.app = HTTPProxy(controller_name)
+        self.app = HTTPProxy(
+            controller_name=controller_name,
+            request_timeout_s=(
+                request_timeout_s or RAY_SERVE_REQUEST_PROCESSING_TIMEOUT_S
+            ),
+        )
 
         self.wrapped_app = self.app
 

diff --git a/python/ray/serve/_private/http_state.py b/python/ray/serve/_private/http_state.py
@@ -340,6 +340,7 @@ def _start_proxy(
             controller_name=self._controller_name,
             node_ip_address=node_ip_address,
             http_middlewares=self._config.middlewares,
+            request_timeout_s=self._config.request_timeout_s,
         )
         return proxy
 

diff --git a/python/ray/serve/config.py b/python/ray/serve/config.py
@@ -560,6 +560,7 @@ class HTTPOptions(pydantic.BaseModel):
     root_path: str = ""
     fixed_number_replicas: Optional[int] = None
     fixed_number_selection_seed: int = 0
+    request_timeout_s: Optional[float] = None
 
     @validator("location", always=True)
     def location_backfill_no_server(cls, v, values):

diff --git a/python/ray/serve/controller.py b/python/ray/serve/controller.py
@@ -722,6 +722,7 @@ def get_serve_instance_details(self) -> Dict:
             http_options=HTTPOptionsSchema(
                 host=http_config.host,
                 port=http_config.port,
+                request_timeout_s=http_config.request_timeout_s,
             ),
             http_proxies=self.http_state.get_http_proxy_details()
             if self.http_state

diff --git a/python/ray/serve/schema.py b/python/ray/serve/schema.py
@@ -512,6 +512,10 @@ class HTTPOptionsSchema(BaseModel, extra=Extra.forbid):
             'deployment routes will be prefixed with this path. Defaults to "".'
         ),
     )
+    request_timeout_s: float = Field(
+        default=None,
+        description="The timeout for HTTP requests. Defaults to no timeout.",
+    )
 
 
 @PublicAPI(stability="alpha")

diff --git a/python/ray/serve/scripts.py b/python/ray/serve/scripts.py
@@ -429,10 +429,13 @@ def run(
             "need to call `ray.init` in your code when using `serve run`."
         )
 
-    client = _private_api.serve_start(
-        detached=True,
-        http_options={"host": host, "port": port, "location": "EveryNode"},
-    )
+    http_options = {"host": host, "port": port, "location": "EveryNode"}
+    # Merge http_options with the ones on ServeDeploySchema. If host and/or port is
+    # passed by cli, those continue to take the priority
+    if is_config and isinstance(config, ServeDeploySchema):
+        config_http_options = config.http_options.dict()
+        http_options = {**config_http_options, **http_options}
+    client = _private_api.serve_start(detached=True, http_options=http_options)
 
     try:
         if is_config:

diff --git a/python/ray/serve/tests/test_cli.py b/python/ray/serve/tests/test_cli.py
@@ -29,6 +29,7 @@
     SERVE_DEFAULT_APP_NAME,
     DEPLOYMENT_NAME_PREFIX_SEPARATOR,
 )
+from ray.serve.tests.conftest import check_ray_stop
 
 CONNECTION_ERROR_MSG = "connection error"
 
@@ -1417,5 +1418,63 @@ def test_run_with_auto_address(
         assert expected_warning_message not in logs
 
 
+@pytest.mark.skipif(sys.platform == "win32", reason="File path incorrect on Windows.")
+def test_run_config_request_timeout():
+    """Test running serve with request timeout in http_options.
+
+    The config file has 0.1s as the `request_timeout_s` in the `http_options`. First
+    case checks that when the query runs longer than the 0.1s, the deployment returns a
+    task failed message. The second case checks that when the query takes less than
+    0.1s, the deployment returns a success message.
+    """
+
+    # Restart Ray with RAY_SERVE_HTTP_REQUEST_MAX_RETRIES=1 (at most 1 retry)
+    subprocess.check_output(["ray", "stop", "--force"])
+    wait_for_condition(
+        check_ray_stop,
+        timeout=15,
+    )
+    subprocess.check_output(
+        ["ray", "start", "--head"],
+        env=dict(os.environ, RAY_SERVE_HTTP_REQUEST_MAX_RETRIES="1"),
+    )
+    wait_for_condition(
+        lambda: requests.get("http://localhost:52365/api/ray/version").status_code
+        == 200,
+        timeout=15,
+    )
+
+    config_file_name = os.path.join(
+        os.path.dirname(__file__),
+        "test_config_files",
+        "http_option_request_timeout_s.yaml",
+    )
+    p = subprocess.Popen(["serve", "run", config_file_name])  # NOTE(review): no try/finally; `p` and Ray stay up if a check below raises
+
+    # Ensure the HTTP request is terminated and fails when the deployment runs longer
+    # than the 0.1 request_timeout_s set in the config yaml
+    wait_for_condition(
+        lambda: requests.get("http://localhost:8000/app1?sleep_s=0.11").text
+        == "Task failed with 1 retries.",
+    )
+
+    # Ensure the HTTP request returns the correct response when the deployment runs
+    # shorter than the 0.1 request_timeout_s set up in the config yaml
+    wait_for_condition(
+        lambda: requests.get("http://localhost:8000/app1?sleep_s=0.09").text
+        == "Task Succeeded!",
+    )
+
+    p.send_signal(signal.SIGINT)  # interrupt the `serve run` subprocess, then wait for it to exit
+    p.wait()
+
+    # Stop the Ray instance started above
+    subprocess.check_output(["ray", "stop", "--force"])
+    wait_for_condition(
+        check_ray_stop,
+        timeout=15,
+    )
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", "-s", __file__]))
diff --git a/python/ray/serve/tests/test_config_files/http_option_app_sleeps.py b/python/ray/serve/tests/test_config_files/http_option_app_sleeps.py
@@ -0,0 +1,13 @@
+import time
+from ray import serve
+
+
+@serve.deployment
+def sleep(request):  # Test deployment: sleeps for the requested time, then reports success.
+    sleep_s = float(request.query_params.get("sleep_s", 0))  # `sleep_s` query param; 0 when absent
+    print(f"sleep_s: {sleep_s}")
+    time.sleep(sleep_s)  # blocking sleep for the requested number of seconds
+    return "Task Succeeded!"
+
+
+sleep_node = sleep.bind()
diff --git a/python/ray/serve/tests/test_config_files/http_option_request_timeout_s.yaml b/python/ray/serve/tests/test_config_files/http_option_request_timeout_s.yaml
@@ -0,0 +1,6 @@
+http_options:
+  request_timeout_s: 0.1  # HTTP request timeout, in seconds
+applications:
+  - name: "app1"
+    import_path: ray.serve.tests.test_config_files.http_option_app_sleeps.sleep_node
+    route_prefix: /app1