From ce277e089807360331286f3fd972bb5cd191d154 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Thu, 26 Dec 2024 18:43:05 -0500 Subject: [PATCH] [2/N] API Server: Avoid ulimit footgun (#11530) Signed-off-by: Bowen Wang --- vllm/entrypoints/api_server.py | 4 +++- vllm/entrypoints/openai/api_server.py | 6 +++++- vllm/utils.py | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 95da1c6e7b9bf..daefbff7e5178 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -21,7 +21,7 @@ from vllm.logger import init_logger from vllm.sampling_params import SamplingParams from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, random_uuid +from vllm.utils import FlexibleArgumentParser, random_uuid, set_ulimit from vllm.version import __version__ as VLLM_VERSION logger = init_logger("vllm.entrypoints.api_server") @@ -119,6 +119,8 @@ async def run_server(args: Namespace, logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) + set_ulimit() + app = await init_app(args, llm_engine) assert engine is not None diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 16086689a10d1..2e45b474237f9 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -68,7 +68,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address) + is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -727,6 +727,10 @@ async def run_server(args, **uvicorn_kwargs) -> None: sock_addr = (args.host or "", args.port) sock = create_server_socket(sock_addr) + # workaround to avoid footguns where 
# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501
def set_ulimit(target_soft_limit=65535):
    """Best-effort raise of this process's soft open-file limit.

    Reads ``RLIMIT_NOFILE`` and, when the current soft limit is below
    ``target_soft_limit``, asks the OS to raise the soft limit to that
    value while passing the hard limit back unchanged. If the request is
    rejected, a warning is logged instead of raising, so the API servers
    can call this unconditionally during startup (it exists to avoid
    requests being dropped when many connections are open at once).

    Args:
        target_soft_limit: Desired minimum soft limit on the number of
            open file descriptors.

    Returns:
        None. The effect is the (possibly) updated process limit.
    """
    resource_type = resource.RLIMIT_NOFILE
    current_soft, current_hard = resource.getrlimit(resource_type)

    # An unlimited soft limit already satisfies every target. Check it
    # explicitly: the numeric value of RLIM_INFINITY is platform-dependent
    # and may compare as *smaller* than the target (e.g. -1), in which
    # case a plain `<` test would end up lowering an unlimited limit.
    if (current_soft == resource.RLIM_INFINITY
            or current_soft >= target_soft_limit):
        return

    try:
        resource.setrlimit(resource_type, (target_soft_limit, current_hard))
    except (ValueError, OSError) as e:
        # setrlimit signals most rejections (e.g. soft limit above the
        # hard limit) with ValueError, but may raise OSError when the
        # underlying system call fails. Either way this is best-effort:
        # warn and keep the existing limit.
        logger.warning(
            "Found ulimit of %s and failed to automatically increase "
            "with error %s. This can cause fd limit errors like "
            "`OSError: [Errno 24] Too many open files`. Consider "
            "increasing with ulimit -n", current_soft, e)