[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend #7394

Merged · 88 commits · Aug 21, 2024

Commits
b2e29a5
added proxy to limit use of unix sockets
robertgshaw2-redhat Aug 10, 2024
8d31115
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 10, 2024
6d2b3df
comment
robertgshaw2-redhat Aug 10, 2024
c73e943
use random inproc path
robertgshaw2-redhat Aug 10, 2024
f1768fb
format
robertgshaw2-redhat Aug 10, 2024
601a461
format
robertgshaw2-redhat Aug 10, 2024
1a47d94
format
robertgshaw2-redhat Aug 10, 2024
eeecb09
Update vllm/entrypoints/openai/rpc/client.py
robertgshaw2-redhat Aug 10, 2024
2770e40
cleaning
robertgshaw2-redhat Aug 14, 2024
5a85618
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 18, 2024
938db1d
Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/…
robertgshaw2-redhat Aug 18, 2024
ea2f03e
remove logging
robertgshaw2-redhat Aug 18, 2024
5cebc65
add info message re: concurrency
robertgshaw2-redhat Aug 18, 2024
2c12436
update comment
robertgshaw2-redhat Aug 18, 2024
9afd6ba
update
robertgshaw2-redhat Aug 18, 2024
c262088
format
robertgshaw2-redhat Aug 18, 2024
3e580d5
reorder
robertgshaw2-redhat Aug 18, 2024
d9e10e0
revert
robertgshaw2-redhat Aug 18, 2024
4e3a63a
fix
robertgshaw2-redhat Aug 18, 2024
e54bf8a
fix
robertgshaw2-redhat Aug 18, 2024
6544f3a
fix abort logic
robertgshaw2-redhat Aug 18, 2024
81f4da8
reduce LOC change
robertgshaw2-redhat Aug 18, 2024
b3374bc
cleanup
robertgshaw2-redhat Aug 18, 2024
dd1817a
cleanup
robertgshaw2-redhat Aug 18, 2024
5b56365
format
robertgshaw2-redhat Aug 18, 2024
05ff816
fix client
robertgshaw2-redhat Aug 18, 2024
e551d30
revert unnecessary change
robertgshaw2-redhat Aug 18, 2024
3d7f65f
revert startup probe changes to separate PR
robertgshaw2-redhat Aug 18, 2024
e7e6f1e
stash
robertgshaw2-redhat Aug 18, 2024
eaaebcc
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 18, 2024
21b5239
stash draining
robertgshaw2-redhat Aug 19, 2024
7e15b00
update
robertgshaw2-redhat Aug 19, 2024
74c4166
stash
robertgshaw2-redhat Aug 19, 2024
450e949
convert RPCServer to use DEALER
robertgshaw2-redhat Aug 19, 2024
8348f1f
stash
robertgshaw2-redhat Aug 19, 2024
545956e
fix
robertgshaw2-redhat Aug 19, 2024
7a34611
cleaning
robertgshaw2-redhat Aug 19, 2024
50abb94
stash
robertgshaw2-redhat Aug 19, 2024
1723687
remove awk
robertgshaw2-redhat Aug 19, 2024
3dfc9ef
nits
robertgshaw2-redhat Aug 20, 2024
8d40f2d
format
robertgshaw2-redhat Aug 20, 2024
3397460
format
robertgshaw2-redhat Aug 20, 2024
ef132dc
nit
robertgshaw2-redhat Aug 20, 2024
10ef204
change
robertgshaw2-redhat Aug 20, 2024
b67718f
clean
robertgshaw2-redhat Aug 20, 2024
c3c1dbe
Update vllm/entrypoints/openai/rpc/server.py
robertgshaw2-redhat Aug 20, 2024
ee6efcf
format
robertgshaw2-redhat Aug 20, 2024
3fdc2fe
cleanup abort logic
robertgshaw2-redhat Aug 20, 2024
4cacb56
nit
robertgshaw2-redhat Aug 20, 2024
724eb31
added load test
robertgshaw2-redhat Aug 21, 2024
4d5e6b7
update load test
robertgshaw2-redhat Aug 21, 2024
b9e4168
updated
robertgshaw2-redhat Aug 21, 2024
8f9bc23
format
robertgshaw2-redhat Aug 21, 2024
9a2be3f
updated
robertgshaw2-redhat Aug 21, 2024
dee38f0
revert spurious change
robertgshaw2-redhat Aug 21, 2024
e78f443
convert to even smaller model
robertgshaw2-redhat Aug 21, 2024
cc2d7db
20k requests
robertgshaw2-redhat Aug 21, 2024
b40e269
convert to 10k requests
robertgshaw2-redhat Aug 21, 2024
03eed9c
clean up closing logic
robertgshaw2-redhat Aug 21, 2024
f697226
use constant
robertgshaw2-redhat Aug 21, 2024
fd642ab
fix bad cleanup
robertgshaw2-redhat Aug 21, 2024
762c2ed
remove useless argument
robertgshaw2-redhat Aug 21, 2024
c805ed2
up to 20k requests
robertgshaw2-redhat Aug 21, 2024
2e1652e
revert to 10k requests
robertgshaw2-redhat Aug 21, 2024
3e1ede4
revert spurious argument
robertgshaw2-redhat Aug 21, 2024
b3bf7ef
revert to 20k
robertgshaw2-redhat Aug 21, 2024
708bd34
format
robertgshaw2-redhat Aug 21, 2024
10a88ec
[BugFix] Raise all exception variations in async generator
njhill Aug 20, 2024
db8aebc
Fix possible premature generator completion; add tests
njhill Aug 21, 2024
b16c64b
format
robertgshaw2-redhat Aug 21, 2024
a9ecaa9
added test accuracy
robertgshaw2-redhat Aug 21, 2024
6f8d5e8
format
robertgshaw2-redhat Aug 21, 2024
bab177f
updated test pipeline
robertgshaw2-redhat Aug 21, 2024
7b58281
fix lm eval
robertgshaw2-redhat Aug 21, 2024
adf45d1
cleanup
robertgshaw2-redhat Aug 21, 2024
9e827b0
updated
robertgshaw2-redhat Aug 21, 2024
47dca36
Merge branch 'main' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 21, 2024
f84c341
added sleep time
robertgshaw2-redhat Aug 21, 2024
0ce78f8
actually sleep
robertgshaw2-redhat Aug 21, 2024
8054348
formatting
robertgshaw2-redhat Aug 21, 2024
5ddbdab
format
robertgshaw2-redhat Aug 21, 2024
1ebbe9e
mypy
robertgshaw2-redhat Aug 21, 2024
53d639b
mypy
robertgshaw2-redhat Aug 21, 2024
a36b381
format
robertgshaw2-redhat Aug 21, 2024
415ee39
remove test load
robertgshaw2-redhat Aug 21, 2024
26440e6
stash
robertgshaw2-redhat Aug 21, 2024
2442a9d
Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/…
robertgshaw2-redhat Aug 21, 2024
b72f84f
Merge branch 'fix-raise-cancelled' into fix-zmq-max-sockets
robertgshaw2-redhat Aug 21, 2024
Changes from all commits
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -86,6 +86,7 @@ steps:
- vllm/
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
Review comment from robertgshaw2-redhat (Collaborator, Author), Aug 21, 2024:
Need to install from source since the local-completions API with support for concurrent requests is not yet in a released version of lm_eval.

- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai

101 changes: 88 additions & 13 deletions tests/async_engine/test_async_llm_engine.py
@@ -1,14 +1,19 @@
import asyncio
import os
from asyncio import CancelledError
from dataclasses import dataclass
from typing import Optional

import pytest
import pytest_asyncio
import torch

from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput

from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear


@@ -118,33 +123,103 @@ async def test_new_requests_event():
os.environ.pop("VLLM_ALLOW_ENGINE_USE_RAY")


def test_asyncio_run():
def start_engine():
wait_for_gpu_memory_to_clear(
devices=list(range(torch.cuda.device_count())),
threshold_bytes=2 * 2**30,
timeout_s=60,
)

engine = AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m"))
return AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m", enforce_eager=True))


@pytest_asyncio.fixture(scope="module")
async def async_engine():
engine = await asyncio.get_event_loop().run_in_executor(executor=None,
func=start_engine)
try:
yield engine
finally:
engine.shutdown_background_loop()
del engine
await asyncio.sleep(0.1)
cleanup()


@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
# So we can share the async engine fixture between these tests
return False


@pytest.mark.asyncio(scope="module")
async def test_asyncio_run(async_engine):

async def run(prompt: str):
sampling_params = SamplingParams(
temperature=0,
max_tokens=32,
)

async for output in engine.generate(prompt,
sampling_params,
request_id=prompt):
async for output in async_engine.generate(prompt,
sampling_params,
request_id=prompt):
final_output = output
return final_output

async def generate():
return await asyncio.gather(
run("test0"),
run("test1"),
)

results = asyncio.run(generate())
results = await asyncio.gather(
run("test0"),
run("test1"),
)
assert len(results) == 2


@pytest.mark.asyncio(scope="module")
async def test_cancellation(async_engine):
sampling_params = SamplingParams(
temperature=0,
min_tokens=10,
max_tokens=10,
)

i = 0
with pytest.raises(CancelledError):
async for output in async_engine.generate("test2",
sampling_params,
request_id="test2"):
assert not output.finished
i += 1
if i == 5:
await async_engine.abort("test2")

assert i == 5


@pytest.mark.asyncio(scope="module")
async def test_delayed_generator(async_engine):
sampling_params = SamplingParams(
temperature=0,
min_tokens=10,
max_tokens=10,
)

stream = async_engine.generate("test3",
sampling_params,
request_id="test3")
i = 0
final_output: Optional[RealRequestOutput] = None
async for output in stream:
final_output = output
if i == 0:
# wait for generation to complete before consuming
# the remaining messages
await asyncio.sleep(1)
if i < 9:
assert not output.finished
i += 1

assert i == 10
assert final_output is not None
assert len(final_output.outputs[0].token_ids) == 10
assert final_output.finished
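As an aside on the test plumbing above: the new tests share one engine through a module-scoped pytest-asyncio fixture and disable the per-test global cleanup via should_do_global_cleanup_after_test. A minimal, self-contained sketch of that pattern (generic names, not the vLLM fixtures) could look like this:

```python
import asyncio

import pytest
import pytest_asyncio


@pytest_asyncio.fixture(scope="module")
async def shared_engine():
    # Stand-in for an expensive-to-start resource such as AsyncLLMEngine.
    engine = {"started": True}
    yield engine
    engine["started"] = False  # teardown runs once, after the whole module


@pytest.mark.asyncio(scope="module")
async def test_first(shared_engine):
    assert shared_engine["started"]
    await asyncio.sleep(0)


@pytest.mark.asyncio(scope="module")
async def test_second(shared_engine):
    # Runs on the same event loop and sees the same fixture instance.
    assert shared_engine["started"]
```

Disabling the per-test global cleanup is what allows the single engine instance to survive across the tests in the module.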
55 changes: 55 additions & 0 deletions tests/entrypoints/openai/test_accuracy.py
@@ -0,0 +1,55 @@
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""

import lm_eval
import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58


@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len", "4096", "--enable-chunked-prefill",
"--disable-log-requests", "--enforce-eager"
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server


@pytest.fixture(scope="module")
def server_data(server):
return {
"url": f"{server.url_for('v1')}/completions",
}


def test_lm_eval_accuracy(server_data):
model_args = (f"model={MODEL_NAME},"
f"base_url={server_data['url']},"
f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

results = lm_eval.simple_evaluate(
model="local-completions",
model_args=model_args,
tasks=TASK,
)

measured_value = results["results"][TASK][FILTER]
assert (measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
26 changes: 18 additions & 8 deletions vllm/engine/async_llm_engine.py
@@ -2,8 +2,8 @@
import time
from dataclasses import dataclass
from functools import partial
from typing import (AsyncGenerator, Callable, Dict, Iterable, List, Mapping,
Optional, Set, Tuple, Type, Union)
from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
Mapping, Optional, Set, Tuple, Type, Union)

import torch
from typing_extensions import assert_never
@@ -85,9 +85,8 @@ def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None:

def put(self, item: Union[RequestOutput, EmbeddingRequestOutput,
Exception]) -> None:
if self._finished:
return
self._queue.put_nowait(item)
if not self._finished:
self._queue.put_nowait(item)

def finish(
self,
@@ -96,7 +95,7 @@ def finish(
if not self._finished:
self._finished = True
self._queue.put_nowait(
exception if exception is not None else STOP_ITERATION)
exception if self._is_raisable(exception) else STOP_ITERATION)

@property
def finished(self) -> bool:
@@ -106,9 +105,9 @@ async def generator(
self
) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]:
try:
while not self._finished:
while True:
result = await self._queue.get()
if isinstance(result, Exception):
if self._is_raisable(result):
if result == STOP_ITERATION:
return
raise result
@@ -117,6 +116,12 @@
self._cancel(self.request_id)
raise asyncio.CancelledError from None

@staticmethod
def _is_raisable(value: Any):
return isinstance(value, BaseException) or \
(isinstance(value, type) and \
issubclass(value, BaseException))


class RequestTracker:
"""Synchronous abstraction for tracking requests."""
@@ -761,6 +766,11 @@ def is_stopped(self) -> bool:
def errored(self) -> bool:
return self._errored_with is not None

@property
def limit_concurrency(self) -> Optional[int]:
"""Maximum number of concurrently running requests."""
return None

def set_errored(self, exc: Exception) -> None:
self._errored_with = exc

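To make the _is_raisable change concrete, here is a small self-contained sketch (generic queue and sentinel, not the vLLM AsyncStream class) of why the stream has to treat exception classes and exception instances alike: callers may push asyncio.CancelledError itself rather than an instance of it, and the old isinstance(result, Exception) check would let that slip through as a normal output.

```python
import asyncio
from typing import Any


def is_raisable(value: Any) -> bool:
    # Accept exception instances (ValueError("boom")) as well as
    # exception classes (asyncio.CancelledError), mirroring the new check.
    return isinstance(value, BaseException) or (
        isinstance(value, type) and issubclass(value, BaseException))


async def consume(queue: "asyncio.Queue[Any]") -> list:
    outputs = []
    while True:
        item = await queue.get()
        if is_raisable(item):
            if item is StopAsyncIteration:  # stand-in for STOP_ITERATION
                return outputs
            raise item  # `raise` works for both classes and instances
        outputs.append(item)


async def main() -> None:
    queue: "asyncio.Queue[Any]" = asyncio.Queue()
    queue.put_nowait("partial output")
    queue.put_nowait(asyncio.CancelledError)  # a class, not an instance
    try:
        await consume(queue)
    except asyncio.CancelledError:
        print("stream terminated with CancelledError, as intended")


asyncio.run(main())
```

The same check also covers BaseException subclasses such as CancelledError that are not Exception subclasses on Python 3.8+, which is the "all exception variations" referred to in the commit message.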
4 changes: 4 additions & 0 deletions vllm/engine/protocol.py
@@ -29,6 +29,10 @@ def is_stopped(self) -> bool:
def errored(self) -> bool:
...

@property
def limit_concurrency(self) -> Optional[int]:
"""Maximum number of concurrently running requests."""

def generate(
self,
inputs: PromptInputs,
9 changes: 9 additions & 0 deletions vllm/entrypoints/launcher.py
@@ -27,6 +27,15 @@ async def serve_http(app: FastAPI, engine: AsyncEngineClient,

logger.info("Route: %s, Methods: %s", path, ', '.join(methods))

# Set concurrency limits in uvicorn if running in multiprocessing mode
# since zmq has a maximum socket limit of zmq.constants.SOCKET_LIMIT (65536).
if engine.limit_concurrency is not None:
logger.info(
"Launching Uvicorn with --limit_concurrency %s. To avoid this "
"limit at the expense of performance run with "
"--disable-frontend-multiprocessing", engine.limit_concurrency)
uvicorn_kwargs["limit_concurrency"] = engine.limit_concurrency

config = uvicorn.Config(app, **uvicorn_kwargs)
server = uvicorn.Server(config)
_add_shutdown_handlers(app, server, engine)
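A hedged sketch of the pattern this launcher change introduces (the app, port, and limit value below are illustrative, not vLLM's): when the engine client advertises a concurrency ceiling, passing it to uvicorn as limit_concurrency makes the HTTP layer shed excess connections with 503s instead of exhausting ZMQ sockets.

```python
from typing import Optional

import uvicorn
from fastapi import FastAPI

app = FastAPI()


@app.get("/ping")
async def ping() -> dict:
    return {"ok": True}


def launch(limit_concurrency: Optional[int]) -> None:
    kwargs: dict = {"host": "127.0.0.1", "port": 8000}
    if limit_concurrency is not None:
        # uvicorn rejects connections beyond this count with HTTP 503,
        # keeping the process under the per-context ZMQ socket ceiling.
        kwargs["limit_concurrency"] = limit_concurrency
    uvicorn.Server(uvicorn.Config(app, **kwargs)).run()


if __name__ == "__main__":
    launch(limit_concurrency=8192)  # illustrative value
```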
11 changes: 6 additions & 5 deletions vllm/entrypoints/openai/api_server.py
@@ -135,6 +135,12 @@ async def build_async_engine_client(
logger.info("Multiprocessing frontend to use %s for RPC Path.",
rpc_path)

# Build RPCClient, which conforms to AsyncEngineClient Protocol.
Review comment from the PR author (robertgshaw2-redhat): moved first, since we bind here. (Note: this shouldn't matter, but just being safe.)
# NOTE: Actually, this is not true yet. We still need to support
# embedding models via RPC (see TODO above)
rpc_client = AsyncEngineRPCClient(rpc_path)
async_engine_client = rpc_client # type: ignore

# Start RPCServer in separate process (holds the AsyncLLMEngine).
context = multiprocessing.get_context("spawn")
# the current process might have CUDA context,
@@ -145,11 +151,6 @@
rpc_server_process.start()
logger.info("Started engine process with PID %d",
rpc_server_process.pid)
# Build RPCClient, which conforms to AsyncEngineClient Protocol.
# NOTE: Actually, this is not true yet. We still need to support
# embedding models via RPC (see TODO above)
rpc_client = AsyncEngineRPCClient(rpc_path)
async_engine_client = rpc_client # type: ignore

try:
while True:
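The ordering note above is easier to see with a bare pyzmq example. This is only a sketch of the general bind-before-connect pattern (the DEALER sockets and the IPC path are illustrative, not the exact vLLM wiring): ZMQ usually tolerates either order, which matches the "shouldn't matter" comment, but creating and binding the client endpoint before spawning the server process removes any window in which the server connects to an endpoint nobody owns.

```python
import multiprocessing
import os
import tempfile

import zmq


def run_server(path: str) -> None:
    # Engine side: connects to the endpoint the client has already bound.
    sock = zmq.Context().socket(zmq.DEALER)
    sock.connect(path)
    print("server received:", sock.recv_multipart())
    sock.send_multipart([b"done"])


if __name__ == "__main__":
    path = f"ipc://{tempfile.gettempdir()}/rpc_demo_{os.getpid()}.sock"

    # Client side binds first ...
    client = zmq.Context().socket(zmq.DEALER)
    client.bind(path)

    # ... then the server process is spawned and connects back.
    proc = multiprocessing.get_context("spawn").Process(target=run_server,
                                                        args=(path, ))
    proc.start()

    client.send_multipart([b"do_work"])
    print("client received:", client.recv_multipart())
    proc.join()
```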
14 changes: 12 additions & 2 deletions vllm/entrypoints/openai/rpc/__init__.py
@@ -7,8 +7,18 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams

# Success string used for RPC instructions.
VLLM_RPC_SUCCESS_STR = "SUCCESS"
VLLM_RPC_HEALTHY_STR = "HEALTHY"

# Timeouts.
VLLM_RPC_SERVER_START_TIMEOUT_MS = 1000
VLLM_RPC_HEALTH_TIMEOUT_MS = 10000

# Minimum value of ZMQ.SOCKET_LIMIT to run mp.
VLLM_RPC_SOCKET_LIMIT_CUTOFF = 2000

# HWM is set to Infinity.
VLLM_RPC_ZMQ_HWM = 0


@dataclass
Expand All @@ -34,7 +44,7 @@ class RPCUtilityRequest(Enum):
GET_SCHEDULER_CONFIG = 5
GET_LORA_CONFIG = 6
DO_LOG_STATS = 7
CHECK_HEALTH = 8
IS_SERVER_HEALTHY = 8
IS_TRACING_ENABLED = 9


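As a rough illustration of how constants like these can be applied with pyzmq (a sketch only, not the vLLM client or server code): SOCKET_LIMIT is a read-only context option that caps how many sockets a context may create, and a high-water mark of 0 means an unbounded send/receive queue, so queued messages are never dropped under load.

```python
import zmq

VLLM_RPC_SOCKET_LIMIT_CUTOFF = 2000  # copied from the constants above
VLLM_RPC_ZMQ_HWM = 0                 # 0 == no high-water mark (unbounded)

ctx = zmq.Context()

# Refuse to run multiprocessing mode if libzmq was built with too low a cap.
socket_limit = ctx.get(zmq.SOCKET_LIMIT)
if socket_limit < VLLM_RPC_SOCKET_LIMIT_CUTOFF:
    raise RuntimeError(
        f"ZMQ SOCKET_LIMIT ({socket_limit}) is below "
        f"{VLLM_RPC_SOCKET_LIMIT_CUTOFF}; too low to multiplex RPC traffic.")

sock = ctx.socket(zmq.DEALER)
sock.set_hwm(VLLM_RPC_ZMQ_HWM)  # applies to both SNDHWM and RCVHWM
sock.connect("ipc:///tmp/example_rpc.sock")  # illustrative path
```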