diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go index e9e4214f..a0e7d794 100644 --- a/pkg/cache/cache.go +++ b/pkg/cache/cache.go @@ -87,8 +87,6 @@ var ( metrics.AvgGenerationThroughputToksPerS, metrics.GPUCacheUsagePerc, metrics.CPUCacheUsagePerc, - metrics.GPUCacheUsagePerc, - metrics.CPUCacheUsagePerc, } // histogram metric example - time_to_first_token_seconds, _sum, _bucket _count. histogramMetricNames = []string{ diff --git a/pkg/plugins/gateway/algorithms/least_kv_cache.go b/pkg/plugins/gateway/algorithms/least_kv_cache.go index b0e6e648..ea941a33 100644 --- a/pkg/plugins/gateway/algorithms/least_kv_cache.go +++ b/pkg/plugins/gateway/algorithms/least_kv_cache.go @@ -56,12 +56,15 @@ func (r leastKvCacheRouter) Route(ctx context.Context, pods map[string]*v1.Pod, continue } - gpuCache, err := r.cache.GetPodMetric(pod.Name, metrics.GPUCacheUsagePerc) + // Due to the metric refactor (pull/543) to better support LoRA and multiple models, + // we now use PodModelMetrics instead of PodMetrics in some scenarios. + // This works, but it doesn't look very promising; we can revisit this part later. + gpuCache, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.GPUCacheUsagePerc) if err != nil { klog.Error(err) continue } - cpuCache, err := r.cache.GetPodMetric(pod.Name, metrics.CPUCacheUsagePerc) + cpuCache, err := r.cache.GetPodModelMetric(pod.Name, model, metrics.CPUCacheUsagePerc) if err != nil { klog.Error(err) continue