add request routers - least kv cache, least expected latency (#543)

* Add random adapter scheduler * Add leastExpectedLatency request router * Add least latency scheduler * Add least kv cache router * Add bin packing scheduler (first-fit as example) * Add least utilization scheduler (RPM, TPM, kv_cache, busy_time as utilization) * Add least busy time (or least gpu utilization) router * Add weighted round robin router * Add metrics that scheduling needs (#486) * add scheduler metrics * add metrics into mock app * refactor CacheUsagePerc of CPU and GPU * add instance label into promQL * 适配metrics接口 Change-Id: Icc2a017cb2db445fb760ced2c0034a65f9b37fa8 * add .vscode to gitignore Change-Id: I36a0f54ca1c8a3c16b89c0077df77a119440bed3 * fix mock cpu_cache_usage_perc metrics * feat: add least kv cache into route strategy * add 2 new routers * rm stateful router: weighted round robin * rm scheduler changes --------- Co-authored-by: chenbinbin <chenbinbin.1996@bytedance.com> Co-authored-by: chenzuzhi <chenzuzhi@bytedance.com> Co-authored-by: brosoul <brosoul@126.com>
vllm-project · Jan 3, 2025 · ca7b372 · ca7b372
1 parent b479e56
commit ca7b372
Show file tree

Hide file tree

Showing 5 changed files with 347 additions and 4 deletions.
diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go
@@ -90,6 +90,8 @@ var (
 		metrics.AvgGenerationThroughputToksPerS,
 		metrics.GPUCacheUsagePerc,
 		metrics.CPUCacheUsagePerc,
+		metrics.GPUCacheUsagePerc,
+		metrics.CPUCacheUsagePerc,
 	}
 	// histogram metric example - time_to_first_token_seconds, _sum, _bucket _count.
 	histogramMetricNames = []string{

diff --git a/pkg/plugins/gateway/algorithms/least_busy_time.go b/pkg/plugins/gateway/algorithms/least_busy_time.go
@@ -0,0 +1,86 @@
+/*
+Copyright 2024 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package routingalgorithms
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"math/rand"
+
+	"github.com/aibrix/aibrix/pkg/cache"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/klog/v2"
+)
+
+// leastBusyTimeRouter is the Router implementation whose Route method picks
+// the pod reporting the smallest "gpu_busy_time_ratio" metric.
+type leastBusyTimeRouter struct {
+	// cache is the metrics cache queried through GetPodMetric while routing.
+	cache *cache.Cache
+}
+
+// NewLeastBusyTimeRouter builds a Router that forwards each request to the
+// pod with the lowest GPU busy-time ratio. It panics when cache.GetCache
+// returns an error, because the router cannot work without the metrics cache.
+func NewLeastBusyTimeRouter() Router {
+	c, err := cache.GetCache()
+	if err != nil {
+		panic(err)
+	}
+	return leastBusyTimeRouter{cache: c}
+}
+
+func (r leastBusyTimeRouter) Route(ctx context.Context, pods map[string]*v1.Pod, model string) (string, error) {
+	var targetPodIP string
+	minBusyTimeRatio := math.MaxFloat64 // <= 1 in general
+
+	if len(pods) == 0 {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	for _, pod := range pods {
+		if pod.Status.PodIP == "" {
+			continue
+		}
+
+		busyTimeRatio, err := r.cache.GetPodMetric(pod.Name, "gpu_busy_time_ratio") // todo: replace mock
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatio.GetSimpleValue())
+
+		if busyTimeRatio.GetSimpleValue() < minBusyTimeRatio {
+			minBusyTimeRatio = busyTimeRatio.GetSimpleValue()
+			targetPodIP = pod.Status.PodIP
+		}
+	}
+
+	// Use fallback if no valid metrics
+	if targetPodIP == "" {
+		klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback")
+		var err error
+		targetPodIP, err = selectRandomPod(pods, rand.Intn)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	if targetPodIP == "" {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	return targetPodIP + ":" + podMetricPort, nil
+}
diff --git a/pkg/plugins/gateway/algorithms/least_kv_cache.go b/pkg/plugins/gateway/algorithms/least_kv_cache.go
@@ -0,0 +1,96 @@
+/*
+Copyright 2024 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package routingalgorithms
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"math/rand"
+
+	"github.com/aibrix/aibrix/pkg/cache"
+	metrics "github.com/aibrix/aibrix/pkg/metrics"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/klog/v2"
+)
+
+// leastKvCacheRouter is the Router implementation whose Route method picks
+// the pod with the smallest combined GPU + CPU KV cache usage.
+type leastKvCacheRouter struct {
+	// cache is the metrics cache queried through GetPodMetric while routing.
+	cache *cache.Cache
+}
+
+// NewLeastKvCacheRouter builds a Router that forwards each request to the
+// pod with the lowest combined GPU and CPU KV cache usage. It panics when
+// cache.GetCache returns an error.
+func NewLeastKvCacheRouter() Router {
+	c, err := cache.GetCache()
+	if err != nil {
+		panic(err)
+	}
+	return leastKvCacheRouter{cache: c}
+}
+
+// Route returns "<podIP>:<podMetricPort>" for the pod in pods whose combined
+// KV cache usage (metrics.GPUCacheUsagePerc + metrics.CPUCacheUsagePerc) is
+// the smallest.
+//
+// Pods without an IP, and pods for which either metric lookup fails, are
+// ignored. If no pod yields usable metrics, a pod is chosen with
+// selectRandomPod instead. An error is returned when pods is empty, when the
+// random fallback fails, or when no target address could be determined.
+func (r leastKvCacheRouter) Route(ctx context.Context, pods map[string]*v1.Pod, model string) (string, error) {
+	var targetPodIP string
+	minKvCache := math.MaxFloat64
+
+	if len(pods) == 0 {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	for _, pod := range pods {
+		if pod.Status.PodIP == "" {
+			continue
+		}
+
+		gpuCache, err := r.cache.GetPodMetric(pod.Name, metrics.GPUCacheUsagePerc)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		cpuCache, err := r.cache.GetPodMetric(pod.Name, metrics.CPUCacheUsagePerc)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+
+		// Read each metric once and reuse the values for scoring and logging.
+		gpuUsage := gpuCache.GetSimpleValue()
+		cpuUsage := cpuCache.GetSimpleValue()
+		totalCache := gpuUsage + cpuUsage
+
+		klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v, cpuCache: %v, kvCache: %v",
+			pod.Name, pod.Status.PodIP, gpuUsage, cpuUsage, totalCache)
+
+		if totalCache <= minKvCache {
+			minKvCache = totalCache
+			targetPodIP = pod.Status.PodIP
+		}
+	}
+
+	// Use fallback if no valid metrics
+	if targetPodIP == "" {
+		klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback")
+		var err error
+		targetPodIP, err = selectRandomPod(pods, rand.Intn)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	if targetPodIP == "" {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	klog.V(4).Infof("targetPodIP: %v", targetPodIP)
+	return targetPodIP + ":" + podMetricPort, nil
+}
diff --git a/pkg/plugins/gateway/algorithms/least_latency.go b/pkg/plugins/gateway/algorithms/least_latency.go
@@ -0,0 +1,150 @@
+/*
+Copyright 2024 The Aibrix Team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package routingalgorithms
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"math/rand"
+
+	"github.com/aibrix/aibrix/pkg/cache"
+	"github.com/aibrix/aibrix/pkg/metrics"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/klog/v2"
+)
+
+// leastExpectedLatencyRouter is the Router implementation whose Route method
+// picks the pod with the smallest estimated queuing + prefill + decode latency.
+type leastExpectedLatencyRouter struct {
+	// cache is the metrics cache queried through GetPodModelMetric while routing.
+	cache *cache.Cache
+}
+
+// NewLeastExpectedLatencyRouter builds a Router that forwards each request to
+// the pod with the lowest estimated end-to-end latency. It panics when
+// cache.GetCache returns an error.
+func NewLeastExpectedLatencyRouter() Router {
+	c, err := cache.GetCache()
+	if err != nil {
+		panic(err)
+	}
+	return leastExpectedLatencyRouter{cache: c}
+}
+
+// expectedPhaseLatency scales a pod's observed mean phase time (prefill or
+// decode) from the pod's own average token count to the guessed token count
+// of the incoming request.
+//
+// It returns 0 when the pod has no usable history for the phase (average
+// token count is zero, negative or NaN, or the scaled value is not finite).
+// Dividing by a zero average would otherwise yield +Inf/NaN, which never
+// compares as "smaller" and would silently exclude the pod from selection.
+func expectedPhaseLatency(meanSeconds, avgTokens, guessTokens float64) float64 {
+	if avgTokens <= 0 || math.IsNaN(avgTokens) {
+		return 0
+	}
+	latency := meanSeconds / avgTokens * guessTokens
+	if math.IsNaN(latency) || math.IsInf(latency, 0) {
+		return 0
+	}
+	return latency
+}
+
+// Route returns "<podIP>:<podMetricPort>" for the pod in pods with the
+// smallest expected latency for model.
+//
+// The size of the incoming request is guessed first: the prompt and
+// generation token counts are the averages of the positive per-pod
+// metrics.AvgPromptToksPerReq / metrics.AvgGenerationToksPerReq values,
+// defaulting to 10 and 100 tokens when no pod reports a positive value.
+// Each pod's expected latency is then its queue time plus its mean prefill
+// and decode times, scaled to the guessed token counts.
+//
+// Pods without an IP, pods for which any metric lookup fails, and pods whose
+// total is not a finite number are ignored. If no pod remains, a pod is
+// chosen with selectRandomPod instead. An error is returned when pods is
+// empty, when the random fallback fails, or when no target address could be
+// determined.
+func (r leastExpectedLatencyRouter) Route(ctx context.Context, pods map[string]*v1.Pod, model string) (string, error) {
+	var targetPodIP string
+	minExpectedLatency := math.MaxFloat64
+
+	if len(pods) == 0 {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	// Guess the token counts of the incoming request from the pods' history.
+	sumPromptTokens := 0.0
+	sumGenerationTokens := 0.0
+	cntPrompt := 0
+	cntGeneration := 0
+	for _, pod := range pods {
+		avgPromptTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgPromptToksPerReq)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		avgGenerationTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgGenerationToksPerReq)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		if v := avgPromptTokens.GetSimpleValue(); v > 0 {
+			sumPromptTokens += v
+			cntPrompt++
+		}
+		if v := avgGenerationTokens.GetSimpleValue(); v > 0 {
+			sumGenerationTokens += v
+			cntGeneration++
+		}
+	}
+	guessPromptTokens := 10.0
+	if cntPrompt > 0 {
+		guessPromptTokens = sumPromptTokens / float64(cntPrompt)
+	}
+	guessGenerationTokens := 100.0
+	if cntGeneration > 0 {
+		guessGenerationTokens = sumGenerationTokens / float64(cntGeneration)
+	}
+
+	for _, pod := range pods {
+		if pod.Status.PodIP == "" {
+			continue
+		}
+
+		// expected queuing latency
+		queuingLatency, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.RequestQueueTimeSeconds)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+
+		// expected prefill latency
+		avgPromptTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgPromptToksPerReq)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		prefillTime, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.RequestPrefillTimeSeconds)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		prefillLatency := expectedPhaseLatency(
+			prefillTime.GetHistogramValue().GetMean(), avgPromptTokens.GetSimpleValue(), guessPromptTokens)
+
+		// expected decode latency
+		avgGenerationTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgGenerationToksPerReq)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		decodeTime, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.RequestDecodeTimeSeconds)
+		if err != nil {
+			klog.Error(err)
+			continue
+		}
+		decodeLatency := expectedPhaseLatency(
+			decodeTime.GetHistogramValue().GetMean(), avgGenerationTokens.GetSimpleValue(), guessGenerationTokens)
+
+		queuingSeconds := queuingLatency.GetSimpleValue()
+		totalExpectedLatency := queuingSeconds + prefillLatency + decodeLatency
+		klog.V(4).Infof("pod: %v, podIP: %v, queuingLatency: %v, prefillLatency: %v, decodeLatency: %v, totalExpectedLatency: %v",
+			pod.Name, pod.Status.PodIP, queuingSeconds, prefillLatency, decodeLatency, totalExpectedLatency)
+
+		// A non-finite total cannot be ranked; skip the pod explicitly
+		// instead of relying on NaN/Inf comparison semantics.
+		if math.IsNaN(totalExpectedLatency) || math.IsInf(totalExpectedLatency, 0) {
+			klog.Warningf("pod: %v, skipping non-finite expected latency: %v", pod.Name, totalExpectedLatency)
+			continue
+		}
+
+		if totalExpectedLatency <= minExpectedLatency {
+			minExpectedLatency = totalExpectedLatency
+			targetPodIP = pod.Status.PodIP
+		}
+	}
+
+	// Use fallback if no valid metrics
+	if targetPodIP == "" {
+		klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback")
+		var err error
+		targetPodIP, err = selectRandomPod(pods, rand.Intn)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	if targetPodIP == "" {
+		return "", fmt.Errorf("no pods to forward request")
+	}
+
+	return targetPodIP + ":" + podMetricPort, nil
+}
diff --git a/pkg/plugins/gateway/gateway.go b/pkg/plugins/gateway/gateway.go
@@ -50,7 +50,7 @@ import (
 var (
+	// NOTE(review): defaultRPM and defaultTPMMultiplier are not used in the
+	// visible hunks; presumably rate-limit defaults — confirm against callers.
 	defaultRPM           = 100
 	defaultTPMMultiplier = 1000
-	routingStrategies    = []string{"random", "least-request", "throughput"}
+	// routingStrategies lists the routing strategy names. Each name has a
+	// matching key in the routers map built by NewServer and a matching case
+	// in selectTargetPod.
+	routingStrategies    = []string{"random", "least-request", "throughput", "least-kv-cache", "least-busy-time", "least-latency"}
 )
 
 type Server struct {
@@ -69,9 +69,12 @@ func NewServer(redisClient *redis.Client, c kubernetes.Interface) *Server {
 	}
 	r := ratelimiter.NewRedisAccountRateLimiter("aibrix", redisClient, 1*time.Minute)
 	routers := map[string]routing.Router{
+		// Keys are the routing strategy names dispatched on in selectTargetPod;
+		// each one must map to the constructor of its own router.
-		"random":        routing.NewRandomRouter(),
-		"least-request": routing.NewLeastRequestRouter(),
-		"throughput":    routing.NewThroughputRouter(),
+		"random":          routing.NewRandomRouter(),
+		"least-request":   routing.NewLeastRequestRouter(),
+		"throughput":      routing.NewThroughputRouter(),
+		"least-kv-cache":  routing.NewLeastKvCacheRouter(),
+		"least-busy-time": routing.NewLeastBusyTimeRouter(),
+		"least-latency":   routing.NewLeastExpectedLatencyRouter(),
 	}
 
 	return &Server{
@@ -535,6 +538,12 @@ func (s *Server) selectTargetPod(ctx context.Context, routingStrategy string, po
 		route = s.routers[routingStrategy]
 	case "throughput":
 		route = s.routers[routingStrategy]
+	case "least-kv-cache":
+		route = s.routers[routingStrategy]
+	case "least-busy-time":
+		route = s.routers[routingStrategy]
+	case "least-latency":
+		route = s.routers[routingStrategy]
 	default:
 		route = s.routers["random"]
 	}