[Misc] Update logos, docs and some configuration for v0.1.0 #383

Merged · 2 commits · Nov 12, 2024
Binary file removed docs/source/assets/logos/aibrix-logo-light.png
Binary file added docs/source/assets/logos/aibrix-logo.jpeg
42 changes: 29 additions & 13 deletions docs/source/features/gateway-plugins.rst
@@ -4,16 +4,10 @@
Gateway Routing
===============

Gateway provides features such as user configuration, budgeting, dynamically routing user requests to respective model deployment and provides advanced routing strategies for hetrogenous GPU hardware.

Design
-----------------------------

TBD

Gateway provides features such as user configuration, budgeting, and dynamic routing of user requests to the respective model deployments, along with advanced routing strategies for heterogeneous GPU hardware.

Dynamic Routing
----------------------
---------------

Gateway dynamically creates a route for each model deployment, without the need for manual user configuration.
During requests, the gateway uses the model name from the header to route the request to the respective model deployment.
@@ -22,8 +16,6 @@ During requests, gateway uses model name from the header to route request to res
.. code-block:: bash

    curl -v http://localhost:8888/v1/chat/completions \
    -H "user: your-user-name" \
    -H "model: your-model-name" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer any_key" \
    -d '{
@@ -33,8 +25,33 @@ During requests, gateway uses model name from the header to route request to res
    }'

Rate Limiting
-------------

The gateway supports rate limiting based on the `user` header. You can specify a unique identifier for each `user` to apply rate limits such as requests per minute (RPM) or tokens per minute (TPM).
This `user` header is essential for enabling rate limit support for each client.

To set up rate limiting, add the `user` header to the request, like this:

.. code-block:: bash

    curl -v http://localhost:8888/v1/chat/completions \
    -H "user: your-user-id" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer any_key" \
    -d '{
      "model": "your-model-name",
      "messages": [{"role": "user", "content": "Say this is a test!"}],
      "temperature": 0.7
    }'

.. note::
    Replace "your-user-id" with a unique identifier for each user. This identifier allows the gateway to enforce rate limits on a per-user basis.
    If rate limit support is required, ensure the `user` header is always set in the request. If you do not need rate limiting, you can omit this header.
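
The docs describe what the `user` header enables but not how enforcement works internally. As a rough illustration only (a fixed-window counter with hypothetical names, not the AIBrix gateway's actual implementation), per-user RPM limiting can be sketched in Go like this:

.. code-block:: go

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    // rpmLimiter counts requests per user within the current one-minute
    // window and rejects a request once the user's RPM budget is spent.
    type rpmLimiter struct {
        mu     sync.Mutex
        limit  int
        window time.Time
        counts map[string]int
    }

    func newRPMLimiter(limit int) *rpmLimiter {
        return &rpmLimiter{limit: limit, counts: make(map[string]int)}
    }

    // Allow reports whether the user identified by the "user" header may
    // proceed, incrementing that user's counter if so.
    func (l *rpmLimiter) Allow(user string) bool {
        l.mu.Lock()
        defer l.mu.Unlock()
        now := time.Now().Truncate(time.Minute)
        if now.After(l.window) {
            // A new minute started: reset every user's counter.
            l.window = now
            l.counts = make(map[string]int)
        }
        if l.counts[user] >= l.limit {
            return false
        }
        l.counts[user]++
        return true
    }

    func main() {
        limiter := newRPMLimiter(2)
        for i := 0; i < 3; i++ {
            fmt.Println("allowed:", limiter.Allow("your-user-id"))
        }
        // Prints true, true, false: the third request within the same
        // minute exceeds the RPM budget of 2.
    }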


Routing Strategies
----------------------
------------------

Gateway currently supports two routing strategies (a minimal sketch of the least-request selection logic follows the example below).
1. least-request: routes the request to the pod with the fewest ongoing requests.
@@ -44,12 +61,11 @@ Gateway supports two routing strategies right now.
    curl -v http://localhost:8888/v1/chat/completions \
    -H "user: your-user-name" \
    -H "model: your-model-name" \
    -H "routing-strategy: least-request" \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer any_key" \
    -d '{
      "model": "your-model-name",
      "messages": [{"role": "user", "content": "Say this is a test!"}],
      "temperature": 0.7
    }'
    }'
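
The selection logic itself is not shown in this diff. As a minimal sketch of what least-request routing means (hypothetical names and structure, not the gateway's actual code), picking the pod with the fewest in-flight requests could look like:

.. code-block:: go

    package main

    import "fmt"

    // selectLeastRequestPod returns the pod with the fewest ongoing
    // requests. Ties resolve arbitrarily because Go randomizes map
    // iteration order.
    func selectLeastRequestPod(ongoing map[string]int) string {
        best := ""
        bestCount := int(^uint(0) >> 1) // start from the maximum int
        for pod, count := range ongoing {
            if count < bestCount {
                best, bestCount = pod, count
            }
        }
        return best
    }

    func main() {
        pods := map[string]int{"pod-a": 3, "pod-b": 1, "pod-c": 2}
        fmt.Println(selectLeastRequestPod(pods)) // prints "pod-b"
    }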
4 changes: 0 additions & 4 deletions docs/source/features/runtime.rst
@@ -53,7 +53,6 @@ Then use AI Runtime to download the model from HuggingFace:
    python -m aibrix.downloader \
    --model-uri deepseek-ai/deepseek-coder-6.7b-instruct \
    --local-dir /tmp/aibrix/models_hf/



Download From S3
@@ -106,7 +105,4 @@ Then use AI Runtime to download the model from TOS:
    --local-dir /tmp/aibrix/models_tos/


Model Management
------------------
eagerly await

4 changes: 2 additions & 2 deletions docs/source/index.rst
@@ -1,8 +1,8 @@
Welcome to AIBrix
=================

.. image:: ./assets/logos/aibrix-logo-light.png
    :width: 60%
.. image:: ./assets/logos/aibrix-logo.jpeg
    :width: 40%
    :align: center
    :alt: AIBrix

26 changes: 19 additions & 7 deletions pkg/cache/cache.go
@@ -24,6 +24,7 @@ import (
"io"
"math"
"net/http"
"os"
"strconv"
"strings"
"sync"
@@ -59,19 +60,30 @@ type Cache struct {
    requestTrace map[string]map[string]int // model_name: map[Log2(input_token)-Log2(output_token)]request_count
}
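
// Sketch, not part of this diff: the requestTrace comment above describes
// bucket keys of the form "Log2(input_token)-Log2(output_token)". A
// hypothetical helper that would build such a key from raw token counts
// (assumes "fmt" is imported alongside "math"):
func traceKey(inputTokens, outputTokens float64) string {
    // Collapsing counts into power-of-two buckets keeps the per-model
    // trace map small no matter how widely token counts vary.
    return fmt.Sprintf("%d-%d", int(math.Log2(inputTokens)), int(math.Log2(outputTokens)))
}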

const (
    modelIdentifier                       = "model.aibrix.ai/name"
    podPort                               = 8000
    defaultPodMetricRefreshIntervalInMS   = 50
    writeRequestTraceIntervalInSeconds    = 10
    expireWriteRequestTraceIntervalInMins = 10
)

var (
    instance Cache
    metricNames = []string{"num_requests_running", "num_requests_waiting", "num_requests_swapped",
        "avg_prompt_throughput_toks_per_s", "avg_generation_throughput_toks_per_s"} //, "e2e_request_latency_seconds_sum"}
    podMetricRefreshIntervalInMilliseconds = getPodMetricRefreshInterval()
)

const (
    modelIdentifier                        = "model.aibrix.ai/name"
    podPort                                = 8000
    podMetricRefreshIntervalInMilliseconds = 50
    writeRequestTraceIntervalInSeconds     = 10
    expireWriteRequestTraceIntervalInMins  = 10
)
func getPodMetricRefreshInterval() time.Duration {
    // Allow overriding the pod metric refresh interval via the
    // AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS environment variable.
    value, exists := os.LookupEnv("AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS")
    if exists {
        if intValue, err := strconv.Atoi(value); err == nil {
            return time.Duration(intValue) * time.Millisecond
        }
    }
    // Fall back to the 50ms default when the variable is unset or not an integer.
    return time.Duration(defaultPodMetricRefreshIntervalInMS) * time.Millisecond
}

func GetCache() (*Cache, error) {
    if !instance.initialized {
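
One way to exercise the new override path (an illustrative test sketch, not part of this PR; the test name and values are assumptions):

package cache

import (
    "testing"
    "time"
)

// TestGetPodMetricRefreshInterval verifies that the env override is honored
// and that an invalid value falls back to the 50ms default.
func TestGetPodMetricRefreshInterval(t *testing.T) {
    t.Setenv("AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS", "200")
    if got := getPodMetricRefreshInterval(); got != 200*time.Millisecond {
        t.Fatalf("expected 200ms override, got %v", got)
    }

    t.Setenv("AIBRIX_POD_METRIC_REFRESH_INTERVAL_MS", "not-a-number")
    if got := getPodMetricRefreshInterval(); got != 50*time.Millisecond {
        t.Fatalf("expected 50ms default for an invalid value, got %v", got)
    }
}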