diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py
index c7277a64eece8..04e1de9d19686 100644
--- a/vllm/entrypoints/openai/serving_score.py
+++ b/vllm/entrypoints/openai/serving_score.py
@@ -68,14 +68,13 @@ def __init__(
     async def _embedding_score(
         self,
         tokenizer: Union[AnyTokenizer],
-        text_1: List[Union[List[str], str]],
-        text_2: List[Union[List[str], str]],
+        input_pairs: List[tuple],
         request: ScoreRequest,
         model_name=str,
         request_id=str,
         created_time=int,
         truncate_prompt_tokens: Optional[int] = None,
-        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        lora_request: Optional[Union[LoRARequest | None]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest | None] = None,
         raw_request: Optional[Request] = None,
     ) -> Union[ScoreResponse, ErrorResponse]:
@@ -84,7 +83,6 @@ async def _embedding_score(
         engine_prompts = []
 
         try:
-            input_pairs = make_pairs(text_1, text_2)
             for q, t in input_pairs:
                 request_prompt = f"{q}{tokenizer.sep_token}{t}"
 
@@ -181,18 +179,14 @@ async def _embedding_score(
 
         result_generator = merge_async_iterators(*generators)
 
-        num_prompts = len(engine_prompts)
-
         # Non-streaming response
-        final_res_batch: List[Optional[PoolingRequestOutput]]
-        final_res_batch = [None] * num_prompts
+        final_res_batch: List[Optional[PoolingRequestOutput]] = []
 
         try:
             embeddings = []
             async for i, res in result_generator:
                 embeddings.append(res)
 
-            scores = []
             scorer = torch.nn.CosineSimilarity(0)
 
             for i in range(0, len(embeddings), 2):
@@ -208,7 +202,7 @@ async def _embedding_score(
                 tokens = embeddings[i].prompt_token_ids + embeddings[
                     i + 1].prompt_token_ids
 
-                scores.append(
+                final_res_batch.append(
                     PoolingRequestOutput(
                         request_id=
                         f"{embeddings[i].request_id}_{embeddings[i+1].request_id}",
@@ -216,7 +210,6 @@ async def _embedding_score(
                         prompt_token_ids=tokens,
                         finished=True))
 
-            final_res_batch = scores
             assert all(final_res is not None for final_res in final_res_batch)
 
             final_res_batch_checked = cast(List[PoolingRequestOutput],
@@ -240,14 +233,13 @@ async def _embedding_score(
     async def _cross_encoding_score(
         self,
         tokenizer: Union[AnyTokenizer],
-        text_1: List[Union[List[str], str]],
-        text_2: List[Union[List[str], str]],
+        input_pairs: List[tuple],
         request: ScoreRequest,
         model_name=str,
         request_id=str,
         created_time=int,
         truncate_prompt_tokens: Optional[int] = None,
-        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
+        lora_request: Optional[Union[LoRARequest | None]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest | None] = None,
         raw_request: Optional[Request] = None,
     ) -> Union[ScoreResponse, ErrorResponse]:
@@ -260,7 +252,6 @@ async def _cross_encoding_score(
             raise ValueError(
                 "MistralTokenizer not supported for cross-encoding")
 
-        input_pairs = make_pairs(text_1, text_2)
         for q, t in input_pairs:
             request_prompt = f"{q}{tokenizer.sep_token}{t}"
 
@@ -394,11 +385,12 @@ async def create_score(
             logger.exception("Error in preprocessing prompt inputs")
             return self.create_error_response(str(e))
 
+        input_pairs = make_pairs(request.text_1, request.text_2)
+
         if self.model_config.is_cross_encoder:
             response = await self._cross_encoding_score(
                 tokenizer=tokenizer,
-                text_1=request.text_1,
-                text_2=request.text_2,
+                input_pairs=input_pairs,
                 request=request,
                 model_name=model_name,
                 request_id=request_id,
@@ -411,8 +403,7 @@ async def create_score(
         else:
            response = await self._embedding_score(
                 tokenizer=tokenizer,
-                text_1=request.text_1,
-                text_2=request.text_2,
+                input_pairs=input_pairs,
                 request=request,
                 model_name=model_name,
                 request_id=request_id,
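
For context on what the refactor centralizes: make_pairs is now called once in create_score, and both _cross_encoding_score and _embedding_score receive the resulting (text_1, text_2) tuples instead of building the pairs themselves. The sketch below illustrates the embedding-side scoring the diff keeps (each side of a pair embedded separately, then scored with cosine similarity). make_pairs_sketch, embedding_scores, and the embed callable are hypothetical stand-ins for illustration only, not vLLM's actual helpers.

from typing import Callable, List, Tuple, Union

import torch


def make_pairs_sketch(text_1: Union[str, List[str]],
                      text_2: Union[str, List[str]]) -> List[Tuple[str, str]]:
    # Hypothetical stand-in for make_pairs: wrap bare strings in lists and,
    # when text_1 has a single entry, broadcast it against every text_2.
    t1 = [text_1] if isinstance(text_1, str) else list(text_1)
    t2 = [text_2] if isinstance(text_2, str) else list(text_2)
    if len(t1) == 1:
        t1 = t1 * len(t2)
    return list(zip(t1, t2))


def embedding_scores(pairs: List[Tuple[str, str]],
                     embed: Callable[[str], torch.Tensor]) -> List[float]:
    # Mirror of the loop in _embedding_score: each side of a pair is embedded
    # separately and scored with cosine similarity along dim 0, matching
    # torch.nn.CosineSimilarity(0) in the diff.
    scorer = torch.nn.CosineSimilarity(dim=0)
    return [scorer(embed(q), embed(t)).item() for q, t in pairs]


if __name__ == "__main__":
    # Toy usage: one query broadcast against two documents, scored with
    # random stand-in embeddings.
    torch.manual_seed(0)
    toy_vectors = {"query": torch.randn(8),
                   "doc a": torch.randn(8),
                   "doc b": torch.randn(8)}
    pairs = make_pairs_sketch("query", ["doc a", "doc b"])
    print(embedding_scores(pairs, lambda text: toy_vectors[text]))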