Generic adapter support in the grpc server #32

Merged · 8 commits · Jun 11, 2024
Changes from 4 commits
4 changes: 4 additions & 0 deletions proto/generation.proto
@@ -27,16 +27,20 @@ enum DecodingMethod {

message BatchedGenerationRequest {
string model_id = 1;
// Deprecated in favor of adapter_id
optional string prefix_id = 2;
repeated GenerationRequest requests = 3;
optional string adapter_id = 4;

Parameters params = 10;
}

message SingleGenerationRequest {
string model_id = 1;
// Deprecated in favor of adapter_id
optional string prefix_id = 2;
GenerationRequest request = 3;
optional string adapter_id = 4;

Parameters params = 10;
}
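To illustrate the request shape after this change, here is a minimal sketch using plain dataclasses as a stand-in for the generated protobuf messages (the real code uses the `generation_pb2` stubs; the field layout below mirrors the proto above, everything else is illustrative):

```python
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class GenerationRequest:
    text: str

@dataclass
class BatchedGenerationRequest:
    model_id: str
    requests: List[GenerationRequest] = field(default_factory=list)
    prefix_id: Optional[str] = None   # deprecated in favor of adapter_id
    adapter_id: Optional[str] = None  # names a peft adapter in the cache

# A caller targeting a LoRA adapter sets adapter_id instead of prefix_id.
req = BatchedGenerationRequest(
    model_id="my-model",
    requests=[GenerationRequest(text="Hello")],
    adapter_id="my-lora-adapter",
)
```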
81 changes: 81 additions & 0 deletions vllm/entrypoints/grpc/adapters.py
@@ -0,0 +1,81 @@
"""Contains code to map api requests for adapters (e.g. peft prefixes, LoRA)
into valid LLM engine requests"""
import dataclasses
import json
import os
from typing import Dict, Optional, Union

from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
SingleGenerationRequest)
from vllm.entrypoints.grpc.validation import TGISValidationError
from vllm.lora.request import LoRARequest


@dataclasses.dataclass
class AdapterMetadata:
unique_id: int # Unique integer for vllm to identify the adapter
adapter_type: str # The string name of the peft adapter type, e.g. LORA
full_path: str


@dataclasses.dataclass
class AdapterStore:
cache_path: str # Path to local store of adapters to load from
adapters: Dict[str, AdapterMetadata]
next_unique_id: int = 1


def validate_adapters(
request: Union[SingleGenerationRequest, BatchedGenerationRequest],
adapter_store: Optional[AdapterStore]) -> Dict[str, LoRARequest]:
"""Takes the adapter name from the request and constructs a valid
engine request if one is set. Raises if the requested adapter
does not exist or adapter type is unsupported

Returns the kwarg dictionary to add to an engine.generate() call.
"""
adapter_id = request.adapter_id

if adapter_id and not adapter_store:
TGISValidationError.AdaptersDisabled.error()

if not adapter_id or not adapter_store:
return {}

# If not already cached, we need to validate that files exist and
# grab the type out of the adapter_config.json file
if adapter_id not in adapter_store.adapters:
local_adapter_path = os.path.join(adapter_store.cache_path, adapter_id)
Contributor: I think we should sanitize the adapter_id here to make sure that the user can't send funny things like ../../../etc/passwd.

Collaborator (author): done!

if not os.path.exists(local_adapter_path):
TGISValidationError.AdapterNotFound.error(
adapter_id, "directory does not exist")

adapter_config_path = os.path.join(local_adapter_path,
"adapter_config.json")
if not os.path.exists(adapter_config_path):
TGISValidationError.AdapterNotFound.error(
adapter_id, "invalid adapter: no adapter_config.json found")

# NB: blocks event loop
Contributor: I think this will be important to address - to remove all the file access from the event loop.

Collaborator (author): Yeah, I looked into this a bit and it sounds like the asyncio file access in third-party libs is... not very good.

I'm not 100% up to speed on event loops, would we want to make a new executor for this, sorta like

    file_load_executor = ThreadPoolExecutor(max_workers=n)
    await loop.run_in_executor(file_load_executor,
                               _load_the_config_json_file, ...)

or would that just also block the loop?

Contributor: Yeah, exactly... probably should just make that function contain all the code that runs if we don't find the adapter in the dict (i.e. checking on disk, loading it, etc.).

There's a default asyncio executor that can be used for this kind of thing, or we may want a static one rather than creating one on the fly (not that you were necessarily suggesting that).

Collaborator (author): Cool, I'll see if I can get that working quickly.

Collaborator (author): @njhill can I get a run from your static analysis on this change?

with open(adapter_config_path) as adapter_config_file:
adapter_config = json.load(adapter_config_file)

adapter_type = adapter_config.get("peft_type", None)

# Add to cache
adapter_store.adapters[adapter_id] = AdapterMetadata(
unique_id=adapter_store.next_unique_id,
adapter_type=adapter_type,
full_path=local_adapter_path)

# Build the proper vllm request object
adapter_metadata = adapter_store.adapters[adapter_id]
if adapter_metadata.adapter_type == "LORA":
lora_request = LoRARequest(lora_name=adapter_id,
lora_int_id=adapter_metadata.unique_id,
lora_local_path=adapter_metadata.full_path)
return {"lora_request": lora_request}

# All other types unsupported
TGISValidationError.AdapterUnsupported.error(adapter_metadata.adapter_type)
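The event-loop fix discussed in the review thread above amounts to pushing the blocking disk I/O onto a thread-pool executor. A minimal sketch (function names are illustrative, not the merged implementation; passing `None` selects the loop's default executor):

```python
import asyncio
import json
import os

def _load_adapter_config_sync(adapter_path: str) -> dict:
    # Blocking disk I/O: fine in a worker thread, not on the event loop.
    config_path = os.path.join(adapter_path, "adapter_config.json")
    with open(config_path) as f:
        return json.load(f)

async def load_adapter_config(adapter_path: str) -> dict:
    loop = asyncio.get_running_loop()
    # None -> use the loop's default ThreadPoolExecutor; a static, shared
    # executor could be substituted here as the reviewer suggests.
    return await loop.run_in_executor(None, _load_adapter_config_sync,
                                      adapter_path)
```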
34 changes: 32 additions & 2 deletions vllm/entrypoints/grpc/grpc_server.py
@@ -14,6 +14,7 @@
from vllm import (AsyncLLMEngine, CompletionOutput, RequestOutput,
SamplingParams)
from vllm.config import ModelConfig
from vllm.entrypoints.grpc.adapters import AdapterStore, validate_adapters
from vllm.entrypoints.grpc.pb import generation_pb2_grpc # type: ignore
# yapf: disable
from vllm.entrypoints.grpc.pb.generation_pb2 import (BatchedGenerationRequest,
@@ -32,6 +33,7 @@
from vllm.entrypoints.grpc.validation import validate_input, validate_params
from vllm.entrypoints.openai.serving_completion import merge_async_iterators
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import Logprob
from vllm.tgis_utils import logs
from vllm.tgis_utils.logits_processors import (ExpDecayLengthPenaltyWarper,
@@ -116,9 +118,17 @@ def __init__(self, engine: AsyncLLMEngine, args: argparse.Namespace):
self.skip_special_tokens = not args.output_special_tokens
self.default_include_stop_seqs = args.default_include_stop_seqs

self.adapter_store: Optional[AdapterStore] = None
if args.adapter_cache:
self.adapter_store = AdapterStore(
cache_path=args.adapter_cache,
adapters={}
)

async def _post_init(self):
self.config = await self.engine.get_model_config()
self.tokenizer_group = await self.engine.get_tokenizer_group()
# self.tokenizer_group = await self.engine.get_tokenizer_group()
self.tokenizer_group = self.engine.engine.tokenizer
self.tokenizer = await self.engine.get_tokenizer()

# Swap in the special TGIS stats logger
@@ -144,6 +154,9 @@ async def Generate(self, request: BatchedGenerationRequest,

generators = []
max_is_token_limit = [False] * request_count

adapter_kwargs = await self._validate_adapters(request, context)

for i, req in enumerate(request.requests):
input_ids, max_is_token_limit[i]\
= await self._validate_prompt_and_tokenize(
@@ -154,7 +167,8 @@
self.engine.generate(prompt=req.text,
sampling_params=sampling_params,
request_id=f"{request_id}-{i}",
prompt_token_ids=input_ids),
prompt_token_ids=input_ids,
**adapter_kwargs),
)

# TODO handle cancellation
@@ -210,13 +224,16 @@ async def GenerateStream(
sampling_params, truncate_input_tokens, request.request.text,
context)

adapter_kwargs, _ = await self._validate_adapters(request, context)
Contributor: Not a tuple now, right?

Collaborator (author): oh yeah, totally not. Interestingly python seems totally fine with the unpacking mismatch if you leave an underscore, TIL

result_generator = self.engine.generate(
# prompt is supplied for observability, the text is not
# re-tokenized when `prompt_token_ids` is supplied
prompt=request.request.text,
sampling_params=sampling_params,
request_id=request_id,
prompt_token_ids=input_ids,
**adapter_kwargs
)

resp_options = request.params.response
@@ -423,6 +440,19 @@ async def _validate_and_convert_params(

return sampling_params, deadline

async def _validate_adapters(self,
request: Union[SingleGenerationRequest,
BatchedGenerationRequest],
context: ServicerContext) \
-> Dict[str, LoRARequest]:
try:
adapters = validate_adapters(
request=request, adapter_store=self.adapter_store)
except ValueError as e:
service_metrics.count_request_failure(FailureReasonLabel.VALIDATION)
await context.abort(StatusCode.INVALID_ARGUMENT, str(e))
return adapters

@staticmethod
def _convert_reason(output: CompletionOutput, max_is_token_limit: bool,
time_limit_reached: bool
6 changes: 5 additions & 1 deletion vllm/entrypoints/grpc/validation.py
@@ -1,3 +1,4 @@
import typing
from enum import Enum

from vllm import SamplingParams
@@ -39,8 +40,11 @@ class TGISValidationError(str, Enum):

# Additions that are _not_ in TGIS
TopN = "top_n_tokens ({0}) must be <= {1}"
AdapterNotFound = "can't retrieve adapter with id '{0}': {1}"
AdaptersDisabled = "adapter_id supplied but no adapter store was configured"
AdapterUnsupported = "adapter type {0} is not currently supported"

def error(self, *args, **kwargs):
def error(self, *args, **kwargs) -> typing.NoReturn:
"""Raises a ValueError with a nicely formatted string"""
raise ValueError(self.value.format(*args, **kwargs))
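The `NoReturn` annotation added here lets type checkers know that code after a `.error()` call is unreachable. A minimal stand-alone reproduction of the pattern (illustrative class, not the real `TGISValidationError`):

```python
import typing
from enum import Enum

class ValidationError(str, Enum):
    # Each member's value is a format string for the error message.
    AdapterNotFound = "can't retrieve adapter with id '{0}': {1}"

    def error(self, *args, **kwargs) -> typing.NoReturn:
        """Raises a ValueError with a nicely formatted string."""
        raise ValueError(self.value.format(*args, **kwargs))
```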

2 changes: 2 additions & 0 deletions vllm/tgis_utils/args.py
@@ -82,6 +82,8 @@ def add_tgis_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser.add_argument('--tls-key-path', type=str)
# map to ssl_ca_certs
parser.add_argument('--tls-client-ca-cert-path', type=str)
# path where peft adapters will be loaded from
parser.add_argument('--adapter-cache', type=str)

# TODO check/add other args here

8 changes: 6 additions & 2 deletions vllm/tgis_utils/logs.py
@@ -41,6 +41,7 @@ def log_response(
response=response,
params=request.params,
prefix_id=request.prefix_id,
adapter_id=request.adapter_id,
engine_metrics=engine_metrics,
start_time=start_time,
kind_log=kind_log,
@@ -57,6 +58,7 @@ def log_error(request: Union[BatchedGenerationRequest,
# of just logging the simple string representation of the error
param_str = text_format.MessageToString(request.params, as_one_line=True)
prefix_id = request.prefix_id
adapter_id = request.adapter_id

if isinstance(request, BatchedGenerationRequest):
method_str = "generate"
@@ -69,13 +71,14 @@
input_chars = sum(len(input_) for input_ in inputs)

span_str = (f"{method_str}{{input={short_input} prefix_id={prefix_id} "
f"input_chars=[{input_chars}] params={param_str}")
f"adapter_id={adapter_id} input_chars=[{input_chars}] "
f"params={param_str}")

logger.error("%s: %s", span_str, exception_str)


def _log_response(inputs: List[str], params: Parameters, prefix_id: str,
response: GenerationResponse,
adapter_id: str, response: GenerationResponse,
engine_metrics: Optional[RequestMetrics], start_time: float,
kind_log: str, method_str: str, logger: logging.Logger):
"""Logs responses similar to how the TGIS server does"""
@@ -99,6 +102,7 @@ def _log_response(inputs: List[str], params: Parameters, prefix_id: str,

paramstr = text_format.MessageToString(params, as_one_line=True)
span_str = (f"{method_str}{{input={short_input} prefix_id={prefix_id} "
f"adapter_id={adapter_id} "
f"input_chars=[{input_chars}] params={paramstr} "
f"tokenization_time={tokenization_time * 1e3:.2f}ms "
f"queue_time={queue_time * 1e3:.2f}ms "
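A quick sketch of what the span string built here looks like once `adapter_id` is included; all values below are hypothetical placeholders, and the timing fields that follow in the real code are omitted:

```python
method_str = "generate"
short_input = "'Hello wo...'"
prefix_id = ""          # empty when no prefix is used
adapter_id = "my-lora"  # the newly logged field
input_chars = 11
paramstr = "method: GREEDY"

# Mirrors the f-string layout in _log_response above ({{ escapes a brace).
span_str = (f"{method_str}{{input={short_input} prefix_id={prefix_id} "
            f"adapter_id={adapter_id} "
            f"input_chars=[{input_chars}] params={paramstr}")
print(span_str)
```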