From 970dfdc01d3453c83066e6156278d70bade0350c Mon Sep 17 00:00:00 2001
From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Wed, 21 Aug 2024 15:53:01 -0400
Subject: [PATCH] [Frontend] Improve Startup Failure UX (#7716)

---
 .../entrypoints/openai/test_mp_api_server.py | 29 ++++++++++---------
 vllm/entrypoints/openai/api_server.py        | 27 +++++++++++++----
 2 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/tests/entrypoints/openai/test_mp_api_server.py b/tests/entrypoints/openai/test_mp_api_server.py
index b9fc0c1422b74..fbfe0db19dd03 100644
--- a/tests/entrypoints/openai/test_mp_api_server.py
+++ b/tests/entrypoints/openai/test_mp_api_server.py
@@ -1,3 +1,5 @@
+import time
+
 import pytest
 
 from vllm.entrypoints.openai.api_server import build_async_engine_client
@@ -8,19 +10,20 @@
 
 @pytest.mark.asyncio
 async def test_mp_crash_detection():
-    with pytest.raises(RuntimeError) as excinfo:
-        parser = FlexibleArgumentParser(
-            description="vLLM's remote OpenAI server.")
-        parser = make_arg_parser(parser)
-        args = parser.parse_args([])
-        # use an invalid tensor_parallel_size to trigger the
-        # error in the server
-        args.tensor_parallel_size = 65536
-
-        async with build_async_engine_client(args):
-            pass
-    assert "The server process died before responding to the readiness probe"\
-        in str(excinfo.value)
+    parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
+    parser = make_arg_parser(parser)
+    args = parser.parse_args([])
+    # Use an invalid tensor_parallel_size to trigger the
+    # error in the server.
+    args.tensor_parallel_size = 65536
+
+    start = time.perf_counter()
+    async with build_async_engine_client(args):
+        pass
+    end = time.perf_counter()
+
+    assert end - start < 60, ("Expected vLLM to gracefully shut down in <60s "
+                              "if there is an error in the startup.")
 
 
 @pytest.mark.asyncio
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 266bf79dcdd65..94d8525e429ca 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -8,7 +8,7 @@
 from argparse import Namespace
 from contextlib import asynccontextmanager
 from http import HTTPStatus
-from typing import AsyncIterator, Set
+from typing import AsyncIterator, Optional, Set
 
 from fastapi import APIRouter, FastAPI, Request
 from fastapi.exceptions import RequestValidationError
@@ -60,6 +60,7 @@
 openai_serving_tokenization: OpenAIServingTokenization
 prometheus_multiproc_dir: tempfile.TemporaryDirectory
 
+# Cannot use __name__ (https://github.com/vllm-project/vllm/pull/4765)
 logger = init_logger('vllm.entrypoints.openai.api_server')
 
 _running_tasks: Set[asyncio.Task] = set()
@@ -94,7 +95,15 @@ async def _force_log():
 
 @asynccontextmanager
 async def build_async_engine_client(
-        args: Namespace) -> AsyncIterator[AsyncEngineClient]:
+        args: Namespace) -> AsyncIterator[Optional[AsyncEngineClient]]:
+    """
+    Create AsyncEngineClient, either:
+        - in-process using the AsyncLLMEngine directly
+        - multiprocess using AsyncLLMEngine RPC
+
+    Returns the Client or None if the creation failed.
+    """
+
     # Context manager to handle async_engine_client lifecycle
     # Ensures everything is shutdown and cleaned up on error/exit
     global engine_args
@@ -157,11 +166,13 @@ async def build_async_engine_client(
             try:
                 await rpc_client.setup()
                 break
-            except TimeoutError as e:
+            except TimeoutError:
                 if not rpc_server_process.is_alive():
-                    raise RuntimeError(
-                        "The server process died before "
-                        "responding to the readiness probe") from e
+                    logger.error(
+                        "RPCServer process died before responding "
+                        "to readiness probe")
+                    yield None
+                    return
 
         yield async_engine_client
     finally:
@@ -410,6 +421,10 @@ async def run_server(args, **uvicorn_kwargs) -> None:
     logger.info("args: %s", args)
 
     async with build_async_engine_client(args) as async_engine_client:
+        # If None, creation of the client failed and we exit.
+        if async_engine_client is None:
+            return
+
         app = await init_app(async_engine_client, args)
 
         shutdown_task = await serve_http(