From 233df6f5c4520ae57e4a24acfbaedcc9ce166074 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Wed, 5 Feb 2025 00:46:54 +0000 Subject: [PATCH] [V1][Metrics] Add request_success_total counter, labelled with finish reason (#12579) Signed-off-by: Mark McLoughlin --- tests/entrypoints/openai/test_metrics.py | 1 + vllm/v1/engine/__init__.py | 21 +++++++++++++++++++-- vllm/v1/engine/detokenizer.py | 9 +++++---- vllm/v1/engine/output_processor.py | 22 ++++++++++++---------- vllm/v1/metrics/loggers.py | 15 ++++++++++++++- vllm/v1/metrics/stats.py | 10 +++++++--- vllm/v1/request.py | 15 ++++++++------- 7 files changed, 66 insertions(+), 27 deletions(-) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index a9134be623229..de2333901cc91 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -205,6 +205,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "vllm:gpu_cache_usage_perc", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", + "vllm:request_success_total", "vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_bucket", "vllm:request_prompt_tokens_count", diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 912b92862c96e..6bd548bdcd8e1 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -15,6 +15,23 @@ from vllm.sampling_params import SamplingParams +class RequestFinishedReason(enum.IntEnum): + """ + Reason a request finished - stop, length, or abort. + + stop - a stop string was emitted + length - max_tokens was consumed, or max_model_len was reached + abort - aborted for another reason + + """ + STOP = 0 + LENGTH = 1 + ABORT = 2 + + def __str__(self): + return self.name.lower() + + @dataclass class EngineCoreRequest: @@ -45,7 +62,7 @@ class EngineCoreOutput( request_id: str new_token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[RequestFinishedReason] = None stop_reason: Union[int, str, None] = None @@ -56,7 +73,7 @@ class EngineCoreOutputs( gc=False): # type: ignore[call-arg] #NOTE(Nick): We could consider ways to make this more compact, - # e.g. columnwise layout and using an int enum for finish/stop reason + # e.g. columnwise layout # [num_reqs] outputs: List[EngineCoreOutput] diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 6d800f026b22a..2bce23e68d27b 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,8 @@ from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine import (EngineCoreOutput, EngineCoreRequest, + RequestFinishedReason) logger = init_logger(__name__) @@ -18,7 +19,7 @@ class DetokenizerOutput: output_text: str token_ids: List[int] finished: bool - finish_reason: Optional[str] = None + finish_reason: Optional[RequestFinishedReason] = None stop_reason: Union[int, str, None] = None @@ -147,13 +148,13 @@ def update_from_output( stop_str, truncate_to = stop if truncate_to != -1: self.output_text = self.output_text[:truncate_to] - finish_reason = "stop" # TODO: use constant + finish_reason = RequestFinishedReason.STOP stop_reason = stop_str # TODO: handle stop_token_ids here too? # 3) Update the RequestOutput object with the new text. 
- finished = bool(finish_reason) + finished = finish_reason is not None if self.output_kind == RequestOutputKind.FINAL_ONLY \ and not finished: return None diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index aeefd52399d53..9473666914717 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -161,8 +161,10 @@ def process_outputs( engine_core_output) # 3) Create and handle RequestOutput objects. - if request_output := self._make_request_output( - req_state, detokenizer_output): + if detokenizer_output is not None: + request_output = self._make_request_output( + req_state, detokenizer_output) + if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put_nowait(request_output) @@ -172,6 +174,8 @@ def process_outputs( # Free completed requests. if request_output.finished: + assert detokenizer_output.finish_reason is not None + self.request_states.pop(req_id) if not engine_core_output.finished: # If req not finished in EngineCore, but Detokenizer @@ -180,7 +184,8 @@ def process_outputs( # Track per-request stats iteration_stats.update_from_finished_request( - request_output, req_state.stats) + detokenizer_output.finish_reason, request_output, + req_state.stats) return OutputProcessorOutput( request_outputs=request_outputs, @@ -191,12 +196,8 @@ def process_outputs( @staticmethod def _make_request_output( request_state: RequestState, - detokenizer_output: Optional[DetokenizerOutput], - ) -> Optional[RequestOutput]: - - if detokenizer_output is None: - return None - + detokenizer_output: DetokenizerOutput, + ) -> RequestOutput: request_output = RequestOutput.new( request_state.request_id, request_state.prompt, @@ -207,7 +208,8 @@ def _make_request_output( ) if detokenizer_output.finished: completion_output = request_output.outputs[0] - completion_output.finish_reason = detokenizer_output.finish_reason + completion_output.finish_reason = str( + detokenizer_output.finish_reason) completion_output.stop_reason = detokenizer_output.stop_reason return request_output diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f736e38f192d1..b62351a8fd6e3 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -2,13 +2,14 @@ import time from abc import ABC, abstractmethod -from typing import List +from typing import Dict, List import numpy as np import prometheus_client from vllm.config import ModelConfig from vllm.logger import init_logger +from vllm.v1.engine import RequestFinishedReason from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -116,6 +117,17 @@ def __init__(self, model_config: ModelConfig): documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) + self.counter_request_success: Dict[RequestFinishedReason, + prometheus_client.Counter] = {} + counter_request_success_base = prometheus_client.Counter( + name="vllm:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"]) + for reason in RequestFinishedReason: + self.counter_request_success[ + reason] = counter_request_success_base.labels(*(labelvalues + + [str(reason)])) + self.histogram_num_prompt_tokens_request = \ prometheus_client.Histogram( name="vllm:request_prompt_tokens", @@ -163,6 +175,7 @@ def log(self, scheduler_stats: SchedulerStats, iteration_stats.num_generation_tokens) for finished_request in 
iteration_stats.finished_requests: + self.counter_request_success[finished_request.finish_reason].inc() self.histogram_num_prompt_tokens_request.observe( finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request.observe( diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 88f2c083530ed..36c95e07d8a97 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from vllm.outputs import RequestOutput - from vllm.v1.engine import EngineCoreOutput + from vllm.v1.engine import EngineCoreOutput, RequestFinishedReason @dataclass @@ -32,6 +32,7 @@ class RequestStateStats: class FinishedRequestStats: """Stats associated with a finished request.""" + finish_reason: "RequestFinishedReason" num_prompt_tokens: int = 0 num_generation_tokens: int = 0 @@ -73,8 +74,11 @@ def update_from_output(self, output: "EngineCoreOutput", request_state_stats.num_generation_tokens += num_new_generation_tokens request_state_stats.last_token_time = now - def update_from_finished_request(self, request_output: "RequestOutput", + def update_from_finished_request(self, + finish_reason: "RequestFinishedReason", + request_output: "RequestOutput", request_state_stats: RequestStateStats): self.finished_requests.append( - FinishedRequestStats(len(request_output.prompt_token_ids), + FinishedRequestStats(finish_reason, + len(request_output.prompt_token_ids), request_state_stats.num_generation_tokens)) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 0519d9e787518..eb9bf99b406f8 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -6,7 +6,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics -from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest, RequestFinishedReason from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -109,7 +109,7 @@ def num_output_tokens(self) -> int: def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) - def get_finished_reason(self) -> Union[str, None]: + def get_finished_reason(self) -> Union[RequestFinishedReason, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: @@ -149,7 +149,8 @@ def is_finished(status: "RequestStatus") -> bool: return status > RequestStatus.PREEMPTED @staticmethod - def get_finished_reason(status: "RequestStatus") -> Union[str, None]: + def get_finished_reason( + status: "RequestStatus") -> Union[RequestFinishedReason, None]: return _FINISHED_REASON_MAP.get(status) @@ -158,8 +159,8 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { - RequestStatus.FINISHED_STOPPED: "stop", - RequestStatus.FINISHED_LENGTH_CAPPED: "length", - RequestStatus.FINISHED_ABORTED: "abort", - RequestStatus.FINISHED_IGNORED: "length", + RequestStatus.FINISHED_STOPPED: RequestFinishedReason.STOP, + RequestStatus.FINISHED_LENGTH_CAPPED: RequestFinishedReason.LENGTH, + RequestStatus.FINISHED_ABORTED: RequestFinishedReason.ABORT, + RequestStatus.FINISHED_IGNORED: RequestFinishedReason.LENGTH, }
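
Illustration of the labelled-counter pattern added to vllm/v1/metrics/loggers.py above: one vllm:request_success_total Counter family carrying a finished_reason label, with one child bound per RequestFinishedReason up front so the per-iteration logging path is just a dict lookup plus .inc(). This is a standalone sketch, not part of the patch; the metric name, documentation string, and finished_reason label come from the diff, while the model_name label and the "demo-model" value are assumed stand-ins for the logger's existing labelnames/labelvalues.

    # Sketch only: mirrors the PrometheusStatLogger change above, outside vLLM.
    # Assumption: the logger's existing labels carry a model name; the value
    # "demo-model" is purely illustrative.
    import enum

    import prometheus_client


    class RequestFinishedReason(enum.IntEnum):
        """Copied from the vllm/v1/engine/__init__.py hunk above."""
        STOP = 0
        LENGTH = 1
        ABORT = 2

        def __str__(self):
            return self.name.lower()


    labelnames = ["model_name"]    # assumed existing logger label names
    labelvalues = ["demo-model"]   # hypothetical served model name

    # One Counter family with a finished_reason label; children are bound up
    # front so recording a finished request is a dict lookup plus .inc().
    counter_request_success_base = prometheus_client.Counter(
        name="vllm:request_success_total",
        documentation="Count of successfully processed requests.",
        labelnames=labelnames + ["finished_reason"])

    counter_request_success = {
        reason: counter_request_success_base.labels(*(labelvalues +
                                                      [str(reason)]))
        for reason in RequestFinishedReason
    }

    # Simulate a few finished requests and dump the exposition-format output,
    # e.g. vllm:request_success_total{finished_reason="stop",...} 1.0
    counter_request_success[RequestFinishedReason.STOP].inc()
    counter_request_success[RequestFinishedReason.LENGTH].inc()
    print(prometheus_client.generate_latest().decode())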