Skip to content

Commit

Permalink
✨ add tgi_request_duration histogram
Browse files Browse the repository at this point in the history
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
  • Loading branch information
joerunde committed Jun 5, 2024
1 parent 3e9a627 commit c505577
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
4 changes: 3 additions & 1 deletion vllm/entrypoints/grpc/grpc_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ async def Generate(self, request: BatchedGenerationRequest,
logs.log_response(request=request, response=response,
start_time=start_time, engine_metrics=res.metrics,
sub_request_num=i, logger=logger)
service_metrics.observe_generation_success(start_time=start_time)
responses[i] = response

return BatchedGenerationResponse(responses=responses)
Expand Down Expand Up @@ -279,6 +280,7 @@ async def GenerateStream(
engine_metrics=last_engine_response.metrics
if last_engine_response else None,
logger=logger)
service_metrics.observe_generation_success(start_time=start_time)

def _convert_input_details(
self, result: RequestOutput, resp_options: ResponseOptions,
Expand Down Expand Up @@ -569,7 +571,7 @@ async def _validate_prompt_and_tokenize(
@log_rpc_handler_errors
async def Tokenize(self, request: BatchedTokenizeRequest,
context: ServicerContext) -> BatchedTokenizeResponse:
service_metrics.observe_tokenization_request(request)
service_metrics.count_tokenization_request(request)
#TODO implement these
if request.return_offsets:
await context.abort(StatusCode.INVALID_ARGUMENT,
Expand Down
12 changes: 11 additions & 1 deletion vllm/tgis_utils/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Implements the logging for all tgi_* metrics for compatibility
with TGIS opsviz"""
from enum import StrEnum, auto
import time

from prometheus_client import Counter, Gauge, Histogram

Expand Down Expand Up @@ -54,8 +55,13 @@ def __init__(self):
"tgi_request_queue_duration",
documentation="Request time spent in queue (in seconds)",
buckets=_duration_buckets)
# Total response time from server
self.tgi_request_duration = Histogram(
"tgi_request_duration",
documentation="End-to-end generate request duration (in seconds)",
buckets=_duration_buckets)

def observe_tokenization_request(self, request: BatchedTokenizeRequest):
def count_tokenization_request(self, request: BatchedTokenizeRequest):
    """Increment the tokenize-request input counter by the batch size.

    Each BatchedTokenizeRequest carries one or more sub-requests in
    `request.requests`; the tgi_tokenize_request_input_count counter is
    bumped once per sub-request, not once per batch.
    """
    self.tgi_tokenize_request_input_count.inc(len(request.requests))

def observe_tokenization_response(self, response: BatchedTokenizeResponse):
Expand All @@ -73,6 +79,10 @@ def observe_queue_time(self, engine_output: RequestOutput):
def count_request_failure(self, reason: FailureReasonLabel):
    """Record one failed generate request, labelled with its failure reason.

    Selects the tgi_request_failure counter child for the given `err`
    label value and increments it by one.
    """
    labelled_counter = self.tgi_request_failure.labels(err=reason)
    labelled_counter.inc(1)

def observe_generation_success(self, start_time: float):
    """Record the end-to-end latency of a successfully completed request.

    `start_time` is a wall-clock timestamp (seconds since the epoch, as
    returned by time.time()) captured when the request arrived; the
    elapsed time is observed on the tgi_request_duration histogram.
    """
    self.tgi_request_duration.observe(time.time() - start_time)


class TGISStatLogger(StatLogger):
"""Instance wraps the vLLM StatLogger to report TGIS metric names
Expand Down

0 comments on commit c505577

Please sign in to comment.