-
Notifications
You must be signed in to change notification settings - Fork 266
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add request routers - least kv cache, least expected latency (#543)
* Add random adapter scheduler * Add leastExpectedLatency request router * Add least latency scheduler * Add least kv cache router * Add bin packing scheduler (first-fit as example) * Add least utilization scheduler (RPM, TPM, kv_cache, busy_time as utilization) * Add least busy time (or least gpu utilization) router * Add weighted round robin router * Add metrics that scheduling needed (#486) * add scheduler metrics * add metrics into mock app * refactor CacheUsagePerc of CPU and GPU * add instance label into promQL * 适配metrics接口 Change-Id: Icc2a017cb2db445fb760ced2c0034a65f9b37fa8 * add .vscode to gitignore Change-Id: I36a0f54ca1c8a3c16b89c0077df77a119440bed3 * fix mock cpu_cache_usage_perc metrics * feat: add least kv cache into route strategy * add 2 new routers * rm stateful router: weighted round robin * rm scheduler changes --------- Co-authored-by: chenbinbin <chenbinbin.1996@bytedance.com> Co-authored-by: chenzuzhi <chenzuzhi@bytedance.com> Co-authored-by: brosoul <brosoul@126.com>
- Loading branch information
1 parent
b479e56
commit ca7b372
Showing
5 changed files
with
347 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
/* | ||
Copyright 2024 The Aibrix Team. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package routingalgorithms | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"math" | ||
"math/rand" | ||
|
||
"github.com/aibrix/aibrix/pkg/cache" | ||
v1 "k8s.io/api/core/v1" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
type leastBusyTimeRouter struct { | ||
cache *cache.Cache | ||
} | ||
|
||
func NewLeastBusyTimeRouter() Router { | ||
cacheFetched, err := cache.GetCache() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
return leastBusyTimeRouter{ | ||
cache: cacheFetched, | ||
} | ||
} | ||
|
||
func (r leastBusyTimeRouter) Route(ctx context.Context, pods map[string]*v1.Pod, model string) (string, error) { | ||
var targetPodIP string | ||
minBusyTimeRatio := math.MaxFloat64 // <= 1 in general | ||
|
||
if len(pods) == 0 { | ||
return "", fmt.Errorf("no pods to forward request") | ||
} | ||
|
||
for _, pod := range pods { | ||
if pod.Status.PodIP == "" { | ||
continue | ||
} | ||
|
||
busyTimeRatio, err := r.cache.GetPodMetric(pod.Name, "gpu_busy_time_ratio") // todo: replace mock | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
klog.V(4).Infof("pod: %v, podIP: %v, GPU busy time ratio: %v", pod.Name, pod.Status.PodIP, busyTimeRatio.GetSimpleValue()) | ||
|
||
if busyTimeRatio.GetSimpleValue() < minBusyTimeRatio { | ||
minBusyTimeRatio = busyTimeRatio.GetSimpleValue() | ||
targetPodIP = pod.Status.PodIP | ||
} | ||
} | ||
|
||
// Use fallback if no valid metrics | ||
if targetPodIP == "" { | ||
klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback") | ||
var err error | ||
targetPodIP, err = selectRandomPod(pods, rand.Intn) | ||
if err != nil { | ||
return "", err | ||
} | ||
} | ||
|
||
if targetPodIP == "" { | ||
return "", fmt.Errorf("no pods to forward request") | ||
} | ||
|
||
return targetPodIP + ":" + podMetricPort, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* | ||
Copyright 2024 The Aibrix Team. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package routingalgorithms | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"math" | ||
"math/rand" | ||
|
||
"github.com/aibrix/aibrix/pkg/cache" | ||
metrics "github.com/aibrix/aibrix/pkg/metrics" | ||
v1 "k8s.io/api/core/v1" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
type leastKvCacheRouter struct { | ||
cache *cache.Cache | ||
} | ||
|
||
func NewLeastKvCacheRouter() Router { | ||
cache, err := cache.GetCache() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
return leastKvCacheRouter{ | ||
cache: cache, | ||
} | ||
} | ||
|
||
func (r leastKvCacheRouter) Route(ctx context.Context, pods map[string]*v1.Pod, model string) (string, error) { | ||
var targetPodIP string | ||
minKvCache := math.MaxFloat64 | ||
|
||
if len(pods) == 0 { | ||
return "", fmt.Errorf("no pods to forward request") | ||
} | ||
|
||
for _, pod := range pods { | ||
if pod.Status.PodIP == "" { | ||
continue | ||
} | ||
|
||
gpuCache, err := r.cache.GetPodMetric(pod.Name, metrics.GPUCacheUsagePerc) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
cpuCache, err := r.cache.GetPodMetric(pod.Name, metrics.CPUCacheUsagePerc) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
totalCache := gpuCache.GetSimpleValue() + cpuCache.GetSimpleValue() | ||
|
||
klog.V(4).Infof("pod: %v, podIP: %v, gpuCache: %v, cpuCache: %v, kaCache: %v", | ||
pod.Name, pod.Status.PodIP, gpuCache.GetSimpleValue(), cpuCache.GetSimpleValue(), totalCache) | ||
|
||
if totalCache <= minKvCache { | ||
minKvCache = totalCache | ||
targetPodIP = pod.Status.PodIP | ||
} | ||
} | ||
|
||
// Use fallback if no valid metrics | ||
if targetPodIP == "" { | ||
klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback") | ||
var err error | ||
targetPodIP, err = selectRandomPod(pods, rand.Intn) | ||
if err != nil { | ||
return "", err | ||
} | ||
} | ||
|
||
if targetPodIP == "" { | ||
return "", fmt.Errorf("no pods to forward request") | ||
} | ||
|
||
klog.V(4).Infof("targetPodIP: %v", targetPodIP) | ||
return targetPodIP + ":" + podMetricPort, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
/* | ||
Copyright 2024 The Aibrix Team. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package routingalgorithms | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"math" | ||
"math/rand" | ||
|
||
"github.com/aibrix/aibrix/pkg/cache" | ||
"github.com/aibrix/aibrix/pkg/metrics" | ||
v1 "k8s.io/api/core/v1" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
type leastExpectedLatencyRouter struct { | ||
cache *cache.Cache | ||
} | ||
|
||
func NewLeastExpectedLatencyRouter() Router { | ||
cache, err := cache.GetCache() | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
return leastExpectedLatencyRouter{ | ||
cache: cache, | ||
} | ||
} | ||
|
||
func (r leastExpectedLatencyRouter) Route(ctx context.Context, pods map[string]*v1.Pod, model string) (string, error) { | ||
var targetPodIP string | ||
minExpectedLatency := math.MaxFloat64 | ||
|
||
if len(pods) == 0 { | ||
return "", fmt.Errorf("no pods to forward request") | ||
} | ||
|
||
sumPromptTokens := 0.0 | ||
sumGenerationTokens := 0.0 | ||
cntPromt := 0 | ||
cntGeneration := 0 | ||
for _, pod := range pods { | ||
avgPromptTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgPromptToksPerReq) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
avgGenerationTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgGenerationToksPerReq) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
if avgPromptTokens.GetSimpleValue() > 0 { | ||
sumPromptTokens += avgPromptTokens.GetSimpleValue() | ||
cntPromt += 1 | ||
} | ||
if avgGenerationTokens.GetSimpleValue() > 0 { | ||
sumGenerationTokens += avgGenerationTokens.GetSimpleValue() | ||
cntGeneration += 1 | ||
} | ||
} | ||
guessPromptTokens := 10.0 | ||
if cntPromt > 0 { | ||
guessPromptTokens = sumPromptTokens / float64(cntPromt) | ||
} | ||
guessGenerationTokens := 100.0 | ||
if cntGeneration > 0 { | ||
guessGenerationTokens = sumGenerationTokens / float64(cntGeneration) | ||
} | ||
|
||
for _, pod := range pods { | ||
if pod.Status.PodIP == "" { | ||
continue | ||
} | ||
|
||
// expected queuing latency | ||
queuingLatency, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.RequestQueueTimeSeconds) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
|
||
// expected prefill latency | ||
avgPromptTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgPromptToksPerReq) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
PrefillTime, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.RequestPrefillTimeSeconds) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
prefillLatency := PrefillTime.GetHistogramValue().GetMean() / avgPromptTokens.GetSimpleValue() * guessPromptTokens | ||
|
||
// expected decode latency | ||
avgGenerationTokens, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.AvgGenerationToksPerReq) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
DecodeTime, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.RequestDecodeTimeSeconds) | ||
if err != nil { | ||
klog.Error(err) | ||
continue | ||
} | ||
decodeLatency := DecodeTime.GetHistogramValue().GetMean() / avgGenerationTokens.GetSimpleValue() * guessGenerationTokens | ||
|
||
totalExpectedLatency := queuingLatency.GetSimpleValue() + prefillLatency + decodeLatency | ||
klog.V(4).Infof("pod: %v, podIP: %v, queuingLatency: %v, prefillLatency: %v, decodeLatency: %v, totalExpectedLatency: %v", | ||
pod.Name, pod.Status.PodIP, queuingLatency.GetSimpleValue(), prefillLatency, decodeLatency, totalExpectedLatency) | ||
|
||
if totalExpectedLatency <= minExpectedLatency { | ||
minExpectedLatency = totalExpectedLatency | ||
targetPodIP = pod.Status.PodIP | ||
} | ||
} | ||
|
||
// Use fallback if no valid metrics | ||
if targetPodIP == "" { | ||
klog.Warning("No pods with valid metrics found; selecting a pod randomly as fallback") | ||
var err error | ||
targetPodIP, err = selectRandomPod(pods, rand.Intn) | ||
if err != nil { | ||
return "", err | ||
} | ||
} | ||
|
||
if targetPodIP == "" { | ||
return "", fmt.Errorf("no pods to forward request") | ||
} | ||
|
||
return targetPodIP + ":" + podMetricPort, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters