[Bugfix][Frontend] Fix Issues Under High Load With zeromq #7394
Label: Frontend
Merged: robertgshaw2-redhat merged 88 commits into vllm-project:main from neuralmagic:fix-zmq-max-sockets on Aug 21, 2024
Changes from 72 commits
Commits (88):
b2e29a5 added proxy to limit use of uniz sockets (robertgshaw2-redhat)
8d31115 Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat)
6d2b3df comment (robertgshaw2-redhat)
c73e943 use random inproc path (robertgshaw2-redhat)
f1768fb format (robertgshaw2-redhat)
601a461 foamt (robertgshaw2-redhat)
1a47d94 format (robertgshaw2-redhat)
eeecb09 Update vllm/entrypoints/openai/rpc/client.py (robertgshaw2-redhat)
2770e40 cleaning (robertgshaw2-redhat)
5a85618 Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat)
938db1d Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/… (robertgshaw2-redhat)
ea2f03e remove logging (robertgshaw2-redhat)
5cebc65 add info message re: concurrency (robertgshaw2-redhat)
2c12436 update comment (robertgshaw2-redhat)
9afd6ba update (robertgshaw2-redhat)
c262088 format (robertgshaw2-redhat)
3e580d5 reorder (robertgshaw2-redhat)
d9e10e0 reverT (robertgshaw2-redhat)
4e3a63a fix (robertgshaw2-redhat)
e54bf8a fix (robertgshaw2-redhat)
6544f3a fix abort logic (robertgshaw2-redhat)
81f4da8 reduce LOC change (robertgshaw2-redhat)
b3374bc cleanup (robertgshaw2-redhat)
dd1817a cleanup (robertgshaw2-redhat)
5b56365 format (robertgshaw2-redhat)
05ff816 fix client (robertgshaw2-redhat)
e551d30 revert unneccessary change (robertgshaw2-redhat)
3d7f65f revert startup probe changes to separate PR (robertgshaw2-redhat)
e7e6f1e stash (robertgshaw2-redhat)
eaaebcc Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat)
21b5239 stash draining (robertgshaw2-redhat)
7e15b00 update (robertgshaw2-redhat)
74c4166 stash (robertgshaw2-redhat)
450e949 convert RPCServer to use DEALER (robertgshaw2-redhat)
8348f1f stash (robertgshaw2-redhat)
545956e fix (robertgshaw2-redhat)
7a34611 cleaning (robertgshaw2-redhat)
50abb94 stash (robertgshaw2-redhat)
1723687 remove awk (robertgshaw2-redhat)
3dfc9ef nits (robertgshaw2-redhat)
8d40f2d format (robertgshaw2-redhat)
3397460 format (robertgshaw2-redhat)
ef132dc nit (robertgshaw2-redhat)
10ef204 change (robertgshaw2-redhat)
b67718f clean (robertgshaw2-redhat)
c3c1dbe Update vllm/entrypoints/openai/rpc/server.py (robertgshaw2-redhat)
ee6efcf format (robertgshaw2-redhat)
3fdc2fe cleanup abort logic (robertgshaw2-redhat)
4cacb56 nit (robertgshaw2-redhat)
724eb31 added load test (robertgshaw2-redhat)
4d5e6b7 update load test (robertgshaw2-redhat)
b9e4168 updated (robertgshaw2-redhat)
8f9bc23 format (robertgshaw2-redhat)
9a2be3f updated (robertgshaw2-redhat)
dee38f0 revert suurious change (robertgshaw2-redhat)
e78f443 convert to even smaller model (robertgshaw2-redhat)
cc2d7db 20k requests (robertgshaw2-redhat)
b40e269 convert to 10k requests (robertgshaw2-redhat)
03eed9c clean up closing logic (robertgshaw2-redhat)
f697226 use constant (robertgshaw2-redhat)
fd642ab fix bad cleanup (robertgshaw2-redhat)
762c2ed remove useless argument (robertgshaw2-redhat)
c805ed2 up to 20k requests (robertgshaw2-redhat)
2e1652e revert to 10k requests (robertgshaw2-redhat)
3e1ede4 revert suprious argument (robertgshaw2-redhat)
b3bf7ef revert to 20k (robertgshaw2-redhat)
708bd34 format (robertgshaw2-redhat)
10a88ec [BugFix] Raise all exception variations in async generator (njhill)
db8aebc Fix possible premature generator completion; add tests (njhill)
b16c64b format (robertgshaw2-redhat)
a9ecaa9 added test accuracy (robertgshaw2-redhat)
6f8d5e8 format (robertgshaw2-redhat)
bab177f updated test pipeline (robertgshaw2-redhat)
7b58281 fix lm eval (robertgshaw2-redhat)
adf45d1 cleanup (robertgshaw2-redhat)
9e827b0 updated (robertgshaw2-redhat)
47dca36 Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat)
f84c341 added sleep time (robertgshaw2-redhat)
0ce78f8 actually sleep (robertgshaw2-redhat)
8054348 formatting (robertgshaw2-redhat)
5ddbdab format (robertgshaw2-redhat)
1ebbe9e mypy (robertgshaw2-redhat)
53d639b mypy (robertgshaw2-redhat)
a36b381 format (robertgshaw2-redhat)
415ee39 remove test load (robertgshaw2-redhat)
26440e6 stash (robertgshaw2-redhat)
2442a9d Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/… (robertgshaw2-redhat)
b72f84f Merge branch 'fix-raise-cancelled' into fix-zmq-max-sockets (robertgshaw2-redhat)
New file added in this PR: an lm_eval accuracy test for the OpenAI-compatible server.
@@ -0,0 +1,55 @@
"""
This file tests the accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real-world usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""

import lm_eval
import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len", "4096", "--enable-chunked-prefill",
        "--disable-log-requests", "--enforce-eager"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def server_data(server):
    return {
        "url": f"{server.url_for('v1')}/completions",
    }


def test_lm_eval_accuracy(server_data):
    model_args = (f"model={MODEL_NAME},"
                  f"base_url={server_data['url']},"
                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

    results = lm_eval.simple_evaluate(
        model="local-completions",
        model_args=model_args,
        tasks=TASK,
    )

    measured_value = results["results"][TASK][FILTER]
    assert (measured_value - RTOL < EXPECTED_VALUE
            and measured_value + RTOL > EXPECTED_VALUE
            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
New file added in this PR: a high-load test for the OpenAI-compatible server.
Review comment on this file: cc @simon-mo -- this test takes ~3 minutes on H100. Will likely take >10 min on L4 ... are you okay with this?

@@ -0,0 +1,105 @@
"""
This file tests significant load on the vLLM server.

Inside vLLM, we use a zeromq-based RPC protocol
to enable multiprocessing w/ the API server and
the AsyncLLMEngine to avoid GIL conflicts.

This test confirms that even at high load with many
concurrent requests, zmq does not drop any messages.
"""

import asyncio
import json

import aiohttp
import pytest

from ...utils import RemoteOpenAIServer

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"
NUM_REQUESTS = 20000
MAX_TOKENS = 50
MESSAGES = [{
    "role": "system",
    "content": "you are a helpful assistant"
}, {
    "role": "user",
    "content": "The meaning of life is"
}]


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len", "4096", "--enable-chunked-prefill",
        "--disable-log-requests", "--enforce-eager"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def server_data(server):
    return {
        "url": f"{server.url_for('v1')}/chat/completions",
        "api_key": server.DUMMY_API_KEY
    }


# Cannot use the async OpenAI client due to limitations in the maximum
# number of concurrent requests that can be sent to the server
# from the client.
async def async_openai_chat(model_name, url, api_key):
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        payload = {
            "model": model_name,
            "messages": MESSAGES,
            "temperature": 0.0,
            "max_tokens": MAX_TOKENS,
            "stream": False,
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        async with session.post(url=url, json=payload,
                                headers=headers) as response:
            assert response.status == 200
            # data = json.loads(response.text)
            data = json.loads(await response.text())
            completion_tokens = data["usage"]["completion_tokens"]
            text = data["choices"][0]["message"]

        return (completion_tokens, text)


async def get_request(model_name, url, api_key):
    for _ in range(NUM_REQUESTS):
        yield async_openai_chat(model_name, url, api_key)


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_load(server_data, model_name):
    # Make requests to the server.
    tasks = []
    async for request in get_request(model_name, server_data["url"],
                                     server_data["api_key"]):
        tasks.append(asyncio.create_task(request))
    outputs = await asyncio.gather(*tasks)

    # Check that each client generated exactly 50 tokens.
    # If this is true, then we are not seeing any message dropping in zeromq.
    for idx, (completion_tokens, text) in enumerate(outputs):
        assert completion_tokens == MAX_TOKENS, (
            f"Request {idx}: Expected {MAX_TOKENS} completion tokens but "
            f"only {completion_tokens} were generated. "
            f"zeromq multiprocessing frontend is likely dropping messages. "
            f"Full text:\n\n\n {text}")
@@ -131,6 +131,9 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:
         logger.info("Multiprocessing frontend to use %s for RPC Path.",
                     rpc_path)
 
+        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
+        async_engine_client = AsyncEngineRPCClient(rpc_path)
+
         # Start RPCServer in separate process (holds the AsyncLLMEngine).
         context = multiprocessing.get_context("spawn")
         # the current process might have CUDA context,
@@ -141,8 +144,6 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:
         rpc_server_process.start()
         logger.info("Started engine process with PID %d",
                     rpc_server_process.pid)
-        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
-        async_engine_client = AsyncEngineRPCClient(rpc_path)
 
         try:
             while True:

Review comment on the added lines: "moved first, since we ..." (comment truncated in the capture).
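The "spawn" context in the hunk above matters because the API server process may already hold a CUDA context, which must not be inherited by the engine process. The sketch below illustrates the standard-library pattern only; the entrypoint name and path are placeholders, not vLLM's.

import multiprocessing


def run_rpc_server(rpc_path: str) -> None:
    # Placeholder entrypoint; in vLLM this is where the AsyncLLMEngine-backed
    # RPC server runs in the child process.
    print(f"engine process would serve on {rpc_path}")


if __name__ == "__main__":
    # "spawn" starts a fresh interpreter, so the child does not inherit the
    # parent's CUDA context (unlike "fork").
    context = multiprocessing.get_context("spawn")
    proc = context.Process(target=run_rpc_server,
                           args=("ipc:///tmp/example.sock",))
    proc.start()
    proc.join()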
Review comment: need to install from source, since the local-completions API with support for concurrent requests is not yet in a release of lm_eval.
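For reference (not stated in the PR itself), installing the harness from source typically means pointing pip at the upstream repository, e.g. pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git, where the URL is assumed to be the upstream lm_eval project rather than taken from this PR.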