From c9e1972cb367dc939fc63ae47da7331ad495e400 Mon Sep 17 00:00:00 2001 From: brosoul Date: Fri, 13 Sep 2024 11:23:03 +0800 Subject: [PATCH] Metric standardizing in ai runtime (#163) * feat: init standard metric framework * feat: add engine metric scrape support * format * format * adjust server run port * mv constant config to config.py * style * fix: vllm metric standard rule --- python/aibrix/aibrix/config.py | 3 + python/aibrix/aibrix/downloader/s3.py | 2 +- python/aibrix/aibrix/envs.py | 13 ++++ python/aibrix/aibrix/metrics/__init__.py | 13 ++++ python/aibrix/aibrix/metrics/engine_rules.py | 48 +++++++++++++ .../aibrix/aibrix/metrics/http_collector.py | 70 +++++++++++++++++++ .../aibrix/aibrix/metrics/standard_rules.py | 58 +++++++++++++++ python/aibrix/app.py | 70 ++++++++++++++++++- python/aibrix/poetry.lock | 55 ++++++++++++++- python/aibrix/pyproject.toml | 2 + 10 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 python/aibrix/aibrix/metrics/__init__.py create mode 100644 python/aibrix/aibrix/metrics/engine_rules.py create mode 100644 python/aibrix/aibrix/metrics/http_collector.py create mode 100644 python/aibrix/aibrix/metrics/standard_rules.py diff --git a/python/aibrix/aibrix/config.py b/python/aibrix/aibrix/config.py index 6461ec1a..bd0708c3 100644 --- a/python/aibrix/aibrix/config.py +++ b/python/aibrix/aibrix/config.py @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ + +DEFAULT_METRIC_COLLECTOR_TIMEOUT = 1 diff --git a/python/aibrix/aibrix/downloader/s3.py b/python/aibrix/aibrix/downloader/s3.py index 99f5fe96..c067f904 100644 --- a/python/aibrix/aibrix/downloader/s3.py +++ b/python/aibrix/aibrix/downloader/s3.py @@ -18,7 +18,7 @@ import boto3 from boto3.s3.transfer import TransferConfig -from botocore.config import Config, MAX_POOL_CONNECTIONS +from botocore.config import MAX_POOL_CONNECTIONS, Config from tqdm import tqdm from aibrix import envs diff --git a/python/aibrix/aibrix/envs.py b/python/aibrix/aibrix/envs.py index 36bc201f..f3742679 100644 --- a/python/aibrix/aibrix/envs.py +++ b/python/aibrix/aibrix/envs.py @@ -36,6 +36,9 @@ def _parse_int_or_none(value: Optional[str]) -> Optional[int]: return int(value) +# Runtime Server Config +SERVER_PORT = int(os.getenv("SERVER_PORT", "8080")) + # Model Download Related Config # Downloader Default Directory @@ -71,3 +74,13 @@ def _parse_int_or_none(value: Optional[str]) -> Optional[int]: DOWNLOADER_AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") DOWNLOADER_AWS_ENDPOINT = os.getenv("AWS_ENDPOINT_URL") DOWNLOADER_AWS_REGION = os.getenv("AWS_REGION") + +# Metric Standardizing Related Config +# Scrape config +METRIC_SCRAPE_HOST = os.getenv("METRIC_SCRAPE_HOST", "localhost") +METRIC_SCRAPE_PORT = int(os.getenv("METRIC_SCRAPE_PORT", "8000")) +METRIC_SCRAPE_PATH = os.getenv("METRIC_SCRAPE_PATH", "/metrics") +METRIC_SCRAPE_ENGINE = os.getenv("METRIC_SCRAPE_ENGINE", "vllm") + +# Runtime Metric config +PROMETHEUS_MULTIPROC_DIR = os.getenv("PROMETHEUS_MULTIPROC_DIR", "/tmp/aibrix/metrics/") diff --git a/python/aibrix/aibrix/metrics/__init__.py b/python/aibrix/aibrix/metrics/__init__.py new file mode 100644 index 00000000..6461ec1a --- /dev/null +++ b/python/aibrix/aibrix/metrics/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2024 The Aibrix Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/aibrix/aibrix/metrics/engine_rules.py b/python/aibrix/aibrix/metrics/engine_rules.py new file mode 100644 index 00000000..7738abe0 --- /dev/null +++ b/python/aibrix/aibrix/metrics/engine_rules.py @@ -0,0 +1,48 @@ +# Copyright 2024 The Aibrix Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Standard rules according to
# https://docs.google.com/document/d/1SpSp1E6moa4HSrJnS4x3NpLuj88sMXr2tbofKlzTZpk
# Each entry maps a vLLM metric family name to the AIBrix name it is exposed as.
_VLLM_METRIC_RENAMES: Dict[str, str] = {
    "vllm:request_success": "aibrix:request_success",
    "vllm:num_requests_waiting": "aibrix:queue_size",
    "vllm:time_to_first_token_seconds": "aibrix:time_to_first_token_seconds",
    "vllm:gpu_cache_usage_perc": "aibrix:gpu_cache_usage_perc",
    "vllm:time_per_output_token_seconds": "aibrix:time_per_output_token",
    "vllm:e2e_request_latency_seconds": "aibrix:e2e_request_latency",
}

# Rule table keyed by the source metric family name; every rule is a plain rename.
VLLM_METRIC_STANDARD_RULES: Dict[str, StandardRule] = {
    vllm_name: RenameStandardRule(vllm_name, aibrix_name)
    for vllm_name, aibrix_name in _VLLM_METRIC_RENAMES.items()
}

# TODO add more engine standard rules


def get_metric_standard_rules(engine: str) -> Dict[str, StandardRule]:
    """Return the metric standardizing rules registered for *engine*.

    Args:
        engine: Name of the inference engine; only ``"vllm"`` is known.

    Returns:
        The rule table for the engine, keyed by source metric family name.

    Raises:
        ValueError: If *engine* is anything other than ``"vllm"``.
    """
    if engine != "vllm":
        raise ValueError(f"Engine {engine} is not supported.")
    return VLLM_METRIC_STANDARD_RULES
logger = init_logger(__name__)


class HTTPCollector(Collector):
    """Collector that scrapes a Prometheus text endpoint and re-exports it.

    On every ``collect()`` the endpoint is fetched with a shared
    ``requests.Session``. Each parsed metric family is yielded as-is when
    ``keep_original_metric`` is true, and additionally passed through the
    standardizing rule registered under its name, if any.

    Args:
        endpoint: Full URL of the metrics endpoint to scrape.
        metrics_rules: Rules keyed by source metric family name.
        keep_original_metric: Whether to also export the unmodified family.
        timeout: Timeout handed to ``requests`` for each scrape.
    """

    def __init__(
        self,
        endpoint: str,
        metrics_rules: Dict[str, StandardRule],
        keep_original_metric: bool = True,
        timeout: float = DEFAULT_METRIC_COLLECTOR_TIMEOUT,
    ):
        self.metric_endpoint = endpoint
        self.metrics_rules = metrics_rules
        self.keep_original_metric = keep_original_metric

        self.timeout = timeout
        self.session = requests.Session()

    def _collect(self) -> str:
        """Fetch the raw metrics text; return ``""`` on any failure.

        Failures (non-200 status or an exception from the request) are logged
        as warnings and never raised, so a scrape problem yields no metrics
        instead of an error.
        """
        try:
            response = self.session.get(self.metric_endpoint, timeout=self.timeout)
            if response.status_code != 200:
                logger.warning(
                    f"Failed to collect metrics from {self.metric_endpoint} "
                    f"with status code {response.status_code}, "
                    f"response: {response.text}"
                )
                return ""
            return response.text
        except Exception as e:
            logger.warning(
                f"Failed to collect metrics from {self.metric_endpoint}: {e}"
            )
            return ""

    def collect(self):
        """Yield the scraped metric families and their standardized variants.

        Yields:
            The original family (when ``keep_original_metric`` is true),
            followed by whatever the matching rule produces for it.
        """
        # Local import: ``copy`` is not among this module's file-level imports.
        import copy

        metrics_text = self._collect()

        # Parse eagerly so that a malformed payload is detected before anything
        # is yielded. Like a failed request, it is logged and results in no
        # metrics rather than an exception escaping from collect().
        try:
            families = list(text_string_to_metric_families(metrics_text))
        except Exception as e:
            logger.warning(
                f"Failed to parse metrics from {self.metric_endpoint}: {e}"
            )
            return

        for m in families:
            if self.keep_original_metric:
                yield m

            # metric standardizing rule matched
            if m.name in self.metrics_rules:
                # A rule may rewrite the family it receives. Give it a shallow
                # copy so the family yielded above is never changed afterwards.
                new_metric = self.metrics_rules[m.name](copy.copy(m))
                if new_metric is not None:
                    yield from new_metric
class StandardRule:
    """Base class for rules that turn an engine metric into standard metrics.

    NOTE: ``__call__`` carries ``@abstractmethod`` but this class does not use
    ``ABCMeta``, so the marker is not enforced: the base class can still be
    instantiated and its ``__call__`` returns ``None``.
    """

    def __init__(self, rule_type):
        # Free-form tag identifying the kind of rule (e.g. "RENAME").
        self.rule_type = rule_type

    @abstractmethod
    def __call__(self, metric: "Metric") -> "Iterable[Metric]":
        """Return the standardized metric(s) derived from *metric*."""
        pass


class RenameStandardRule(StandardRule):
    """Rule that re-exports a metric family under a different name."""

    def __init__(self, original_name, new_name):
        super().__init__("RENAME")
        self.original_name = original_name
        self.new_name = new_name

    def __call__(self, metric: "Metric") -> "Iterable[Metric]":
        """Yield a renamed copy of *metric*; the argument is left untouched.

        The family name becomes ``new_name`` and, in every sample name, the
        leading ``len(original_name)`` characters are replaced by ``new_name``
        so suffixes such as ``_total`` or ``_bucket`` are kept.

        Raises:
            AssertionError: If ``metric.name`` differs from ``original_name``.
        """
        # Local import: ``copy`` is not among this module's file-level imports.
        import copy

        assert (
            metric.name == self.original_name
        ), f"Metric name {metric.name} does not match Rule original name {self.original_name}"

        suffix_start = len(self.original_name)

        # Work on a shallow copy: callers may still hold (or already have
        # exported) the original family, so it must not change under them.
        renamed = copy.copy(metric)
        renamed.name = self.new_name
        # ``_replace`` swaps only the name and carries over every other field
        # of the sample tuple unchanged.
        renamed.samples = [
            s._replace(name=self.new_name + s.name[suffix_start:])
            for s in metric.samples
        ]
        yield renamed
logger = init_logger(__name__)


def initial_prometheus_multiproc_dir():
    """Create and empty the Prometheus multiprocess directory, then export it.

    The directory is taken from the ``PROMETHEUS_MULTIPROC_DIR`` environment
    variable when set, otherwise from ``envs.PROMETHEUS_MULTIPROC_DIR``. After
    the call the environment variable names the directory that was prepared.
    """
    prometheus_multiproc_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR")
    if prometheus_multiproc_dir is None:
        prometheus_multiproc_dir = envs.PROMETHEUS_MULTIPROC_DIR

    path = Path(prometheus_multiproc_dir)
    path.mkdir(parents=True, exist_ok=True)

    # Start from an empty directory: remove everything currently inside it.
    for item in path.iterdir():
        # rmtree refuses symlinks, so a link to a directory is unlinked like a
        # regular file instead of being followed.
        if item.is_dir() and not item.is_symlink():
            shutil.rmtree(item)
        else:
            item.unlink()

    # Export the directory that was actually prepared above.
    os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir


def mount_metrics(app: FastAPI):
    """Attach a ``/metrics`` route to *app*.

    The route serves a registry holding the multiprocess collector and an
    ``HTTPCollector`` that scrapes the endpoint described by the
    ``METRIC_SCRAPE_*`` settings in ``envs``.

    Args:
        app: Application whose route list is extended.
    """
    # setup multiprocess collector
    initial_prometheus_multiproc_dir()
    prometheus_multiproc_dir_path = os.environ["PROMETHEUS_MULTIPROC_DIR"]
    logger.info(
        f"AIBrix to use {prometheus_multiproc_dir_path} as PROMETHEUS_MULTIPROC_DIR"
    )
    registry = CollectorRegistry()
    multiprocess.MultiProcessCollector(registry)

    # construct scrape metric config
    engine = envs.METRIC_SCRAPE_ENGINE
    scrape_host = envs.METRIC_SCRAPE_HOST
    scrape_port = envs.METRIC_SCRAPE_PORT
    scrape_path = envs.METRIC_SCRAPE_PATH
    scrape_endpoint = f"http://{scrape_host}:{scrape_port}{scrape_path}"
    collector = HTTPCollector(scrape_endpoint, get_metric_standard_rules(engine))
    registry.register(collector)
    logger.info(
        f"AIBrix to scrape metrics from {scrape_endpoint}, use {engine} standard rules"
    )

    # Add prometheus asgi middleware to route /metrics requests
    metrics_route = Mount("/metrics", make_asgi_app(registry=registry))

    app.routes.append(metrics_route)


def build_app():
    """Build the FastAPI application with the metrics route mounted."""
    app = FastAPI(debug=False)
    mount_metrics(app)
    return app


app = build_app()

# Start the server only when this file is executed as a script. Importing the
# module (for example through an ASGI server given "app:app") must only build
# ``app`` and must not block on a second server.
if __name__ == "__main__":
    uvicorn.run(app, port=envs.SERVER_PORT)
12c365e3..3f4355ea 100644 --- a/python/aibrix/poetry.lock +++ b/python/aibrix/poetry.lock @@ -511,6 +511,20 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "prometheus-client" +version = "0.20.0" +description = "Python client for the Prometheus monitoring system." +optional = false +python-versions = ">=3.8" +files = [ + {file = "prometheus_client-0.20.0-py3-none-any.whl", hash = "sha256:cde524a85bce83ca359cc837f28b8c0db5cac7aa653a588fd7e84ba061c329e7"}, + {file = "prometheus_client-0.20.0.tar.gz", hash = "sha256:287629d00b147a32dcb2be0b9df905da599b2d82f80377083ec8463309a4bb89"}, +] + +[package.extras] +twisted = ["twisted"] + [[package]] name = "pydantic" version = "2.9.1" @@ -894,6 +908,45 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "types-requests" +version = "2.31.0.6" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.7" +files = [ + {file = "types-requests-2.31.0.6.tar.gz", hash = "sha256:cd74ce3b53c461f1228a9b783929ac73a666658f223e28ed29753771477b3bd0"}, + {file = "types_requests-2.31.0.6-py3-none-any.whl", hash = "sha256:a2db9cb228a81da8348b49ad6db3f5519452dd20a9c1e1a868c83c5fe88fd1a9"}, +] + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-requests" +version = "2.32.0.20240907" +description = "Typing stubs for requests" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-requests-2.32.0.20240907.tar.gz", hash = "sha256:ff33935f061b5e81ec87997e91050f7b4af4f82027a7a7a9d9aaea04a963fdf8"}, + {file = "types_requests-2.32.0.20240907-py3-none-any.whl", hash = "sha256:1d1e79faeaf9d42def77f3c304893dea17a97cae98168ac69f3cb465516ee8da"}, +] + +[package.dependencies] +urllib3 = ">=2" + +[[package]] +name = "types-urllib3" +version = "1.26.25.14" +description = "Typing stubs for urllib3" +optional = false +python-versions = "*" +files = [ + {file = 
"types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, + {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1039,4 +1092,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "dfb5ae80314f83f8a52ef0861b64903894b78284f19bcc189545b65bbb809595" +content-hash = "b4835c6566f0d69ee7fa311b899e3a6a581a68f6bae5177720100f15606fa024" diff --git a/python/aibrix/pyproject.toml b/python/aibrix/pyproject.toml index df808dcc..e6b1895b 100644 --- a/python/aibrix/pyproject.toml +++ b/python/aibrix/pyproject.toml @@ -38,6 +38,8 @@ boto3 = "^1.35.5" fastapi = "^0.112.2" gunicorn = "^23.0.0" uvicorn = "^0.30.6" +prometheus-client = "^0.20.0" +types-requests = "^2.31.0" [tool.poetry.group.dev.dependencies]