[FrontEnd] Keep RPC server tcp protocol #7365

Closed · wants to merge 2 commits
vllm/engine/arg_utils.py (5 additions, 0 deletions)

@@ -883,6 +883,7 @@ class AsyncEngineArgs(EngineArgs):
     """Arguments for asynchronous vLLM engine."""
     engine_use_ray: bool = False
     disable_log_requests: bool = False
+    engine_use_rpc: bool = False

     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser,

@@ -896,6 +897,10 @@ def add_cli_args(parser: FlexibleArgumentParser,
         parser.add_argument('--disable-log-requests',
                             action='store_true',
                             help='Disable logging requests.')
+        parser.add_argument('--engine-use-rpc',
+                            action='store_true',
+                            help='Use RPC to communicate between api server '
+                            'and LLM engine.')
         return parser
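For context, here is a minimal, self-contained sketch of what the second hunk wires up, using plain argparse in place of vLLM's FlexibleArgumentParser (which extends the standard parser); only the flag itself comes from the diff above:

# Standalone sketch of the new store_true flag; plain argparse is an
# assumption standing in for vLLM's FlexibleArgumentParser.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--engine-use-rpc',
                    action='store_true',
                    help='Use RPC to communicate between api server '
                    'and LLM engine.')

args = parser.parse_args(['--engine-use-rpc'])
assert args.engine_use_rpc is True   # dashes map to underscores
args = parser.parse_args([])
assert args.engine_use_rpc is False  # store_true defaults to False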
vllm/entrypoints/openai/api_server.py (12 additions, 5 deletions)

@@ -43,7 +43,8 @@
     OpenAIServingTokenization)
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path
+from vllm.utils import (FlexibleArgumentParser, get_ip, get_open_zmq_ipc_path,
+                        get_rpc_port)
 from vllm.version import __version__ as VLLM_VERSION

 TIMEOUT_KEEP_ALIVE = 5  # seconds

@@ -106,10 +107,16 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:

     # Otherwise, use the multiprocessing AsyncLLMEngine.
     else:
-        # Select random path for IPC.
-        rpc_path = get_open_zmq_ipc_path()
-        logger.info("Multiprocessing frontend to use %s for RPC Path.",
-                    rpc_path)
+        if engine_args.engine_use_rpc:
+            # Use the configured port for TCP-based RPC.
+            rpc_port = get_rpc_port()
+            driver_ip = get_ip()
+            rpc_path = f"tcp://{driver_ip}:{rpc_port}"
+        else:
+            # Select random path for IPC.
+            rpc_path = get_open_zmq_ipc_path()
+
+        logger.info("vLLM async engine using %s for RPC path.", rpc_path)

         # Start RPCServer in separate process (holds the AsyncLLMEngine).
         rpc_server_process = Process(target=run_rpc_server,
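The branch above only changes the endpoint string handed to ZeroMQ: a tcp:// address reachable from other hosts versus an ipc:// filesystem socket local to one machine. A hedged sketch of the two transport styles, assuming pyzmq and eliding vLLM's actual RPC server and client:

# Sketch: the same zmq socket API binds to both transports the branch
# above chooses between. Not vLLM code; socket types are illustrative.
import tempfile
import uuid

import zmq

ctx = zmq.Context()

# TCP endpoint, reachable across hosts (what --engine-use-rpc selects).
tcp_sock = ctx.socket(zmq.REP)
tcp_port = tcp_sock.bind_to_random_port("tcp://127.0.0.1")
print(f"bound tcp://127.0.0.1:{tcp_port}")

# IPC endpoint, a filesystem socket local to this machine (the default).
ipc_path = f"ipc://{tempfile.gettempdir()}/{uuid.uuid4()}"  # POSIX only
ipc_sock = ctx.socket(zmq.REP)
ipc_sock.bind(ipc_path)
print(f"bound {ipc_path}")

tcp_sock.close()
ipc_sock.close()
ctx.term()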
vllm/envs.py (5 additions, 0 deletions)

@@ -150,6 +150,11 @@ def get_default_config_root():
     'VLLM_RPC_BASE_PATH':
     lambda: os.getenv('VLLM_RPC_BASE_PATH', tempfile.gettempdir()),

+    # Port used for RPC when the frontend api server is running in
+    # multi-processing mode to communicate with the backend engine process.
+    'VLLM_RPC_PORT':
+    lambda: int(os.getenv('VLLM_RPC_PORT', '5570')),
+
     # If true, will load models from ModelScope instead of Hugging Face Hub.
     # note that the value is true or false, not numbers
     "VLLM_USE_MODELSCOPE":
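Since the lambda above is the whole parsing story, the override behavior can be checked directly; a minimal sketch mirroring the getenv call, not vLLM code:

# Mirror of the VLLM_RPC_PORT lambda in the hunk above: parsed as an
# int, defaulting to 5570 when the variable is unset.
import os

os.environ.pop('VLLM_RPC_PORT', None)
assert int(os.getenv('VLLM_RPC_PORT', '5570')) == 5570

os.environ['VLLM_RPC_PORT'] = '6000'
assert int(os.getenv('VLLM_RPC_PORT', '5570')) == 6000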
vllm/utils.py (10 additions, 1 deletion)

@@ -527,8 +527,17 @@ def get_open_zmq_ipc_path() -> str:
     return f"ipc://{base_rpc_path}/{uuid4()}"


-def get_open_port() -> int:
+def get_rpc_port() -> int:
+    port = envs.VLLM_RPC_PORT
+    return get_port(port)
+
+
+def get_open_port(port: Optional[int] = None) -> int:
     port = envs.VLLM_PORT
+    return get_port(port)
+
+
+def get_port(port: Optional[int] = None) -> int:
     if port is not None:
         while True:
             try:
(remainder of the hunk is truncated in the source)
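The diff cuts off inside get_port(). Based only on the visible prologue (if port is not None / while True / try), a plausible reconstruction would probe the requested port and walk forward on conflicts; this sketch is an assumption, not the PR's actual body:

# Hypothetical reconstruction of the truncated helper; the name
# get_port_sketch marks it as illustrative rather than vLLM's code.
import socket
from typing import Optional


def get_port_sketch(port: Optional[int] = None) -> int:
    if port is not None:
        while True:
            try:
                # Probe the requested port by binding to it briefly.
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    s.bind(("", port))
                return port
            except OSError:
                port += 1  # port taken; probe the next one
    # No preference given: let the OS pick a free ephemeral port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


print(get_port_sketch(5570))  # 5570 unless something is already listening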