Add AIBrix Custom Autoscaling Algorithm APA #223

Merged 4 commits on Oct 5, 2024
18 changes: 18 additions & 0 deletions config/samples/autoscaling_v1alpha1_mock_llama_apa.yaml
@@ -0,0 +1,18 @@
apiVersion: autoscaling.aibrix.ai/v1alpha1
kind: PodAutoscaler
metadata:
  name: podautoscaler-example-mock-llama-apa
  labels:
    app.kubernetes.io/name: aibrix
    app.kubernetes.io/managed-by: kustomize
  namespace: aibrix-system
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: llama2-70b
  minReplicas: 1
  maxReplicas: 10
  targetMetric: "avg_prompt_throughput_toks_per_s"
  targetValue: "20"
  scalingStrategy: "APA"
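
Note: the APA decision rule itself is not part of this sample. Based on the `UpFluctuationTolerance` / `DownFluctuationTolerance` fields exercised in `apa_test.go` later in this PR, a minimal sketch of the tolerance-band rule might look like the following (function and parameter names are assumptions for illustration, not the merged implementation):

```go
package main

import (
	"fmt"
	"math"
)

// apaDesiredReplicas sketches a tolerance-band proportional rule: only react
// when per-pod usage leaves the band [target*(1-downTol), target*(1+upTol)].
func apaDesiredReplicas(current int, totalUsage, target, upTol, downTol float64) int {
	perPod := totalUsage / float64(current)
	switch {
	case perPod > target*(1+upTol):
		// Scale up proportionally so per-pod usage returns to the target.
		return int(math.Ceil(totalUsage / target))
	case perPod < target*(1-downTol):
		// Scale down proportionally, but never below one replica.
		return int(math.Max(1, math.Floor(totalUsage/target)))
	default:
		// Within the tolerance band: hold the current replica count.
		return current
	}
}

func main() {
	// 3 pods sharing 100 tok/s (~33.3 each) against a per-pod target of 20 -> 5.
	fmt.Println(apaDesiredReplicas(3, 100, 20, 0.1, 0.2))
}
```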
2 changes: 1 addition & 1 deletion docs/development/app/app.py
@@ -11,7 +11,7 @@

 MODEL_NAME = 'llama2-70b'
 DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b')
-NAMESPACE = os.getenv('NAMESPACE', 'default')
+NAMESPACE = os.getenv('NAMESPACE', 'aibrix-system')
 DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1'))

models = [
54 changes: 54 additions & 0 deletions docs/development/app/deployment.yaml
@@ -53,6 +53,60 @@ spec:
      targetPort: 8000
      nodePort: 30081
  type: NodePort
---
**Collaborator:** Let's create a separate service account this time

**Collaborator:** I added a commit to address this change. 2a5b389

apiVersion: v1
kind: ServiceAccount
metadata:
  name: pod-autoscaler
  namespace: aibrix-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: pod-reader
  namespace: aibrix-system
rules:
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: read-pods
  namespace: aibrix-system
subjects:
- kind: ServiceAccount
  name: pod-autoscaler
  namespace: aibrix-system
roleRef:
  kind: Role
  name: pod-reader
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: aibrix-system
  name: deployment-reader
rules:
- apiGroups: ["apps"]
  resources: ["deployments"]
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: deployment-reader-binding
  namespace: aibrix-system
subjects:
- kind: ServiceAccount
  name: pod-autoscaler
  namespace: aibrix-system
roleRef:
  kind: Role
  name: deployment-reader
  apiGroup: rbac.authorization.k8s.io
# ---
# for test-purpose, if need to create HTTPRoute object manually
# apiVersion: gateway.networking.k8s.io/v1
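To sanity-check the RBAC added above, the standard `kubectl auth can-i` impersonation check can be run against the new service account (not part of this PR); both commands should print `yes` once the Role/RoleBinding pairs are applied:

```shell
kubectl auth can-i list pods --as=system:serviceaccount:aibrix-system:pod-autoscaler -n aibrix-system
kubectl auth can-i get deployments --as=system:serviceaccount:aibrix-system:pod-autoscaler -n aibrix-system
```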
77 changes: 76 additions & 1 deletion docs/tutorial/podautoscaler/README.md
@@ -55,7 +55,7 @@ Starting workers {"controller": "podautoscaler", "controllerGroup": "autoscaling
For debugging purposes, you can expose the ports in Kubernetes using the following command:

```shell
-kubectl port-forward svc/llama2-70b 8000:8000
+kubectl port-forward svc/llama2-70b 8000:8000 -n aibrix-system
```

## Start 2: Build and Deploy Manager
@@ -308,6 +308,80 @@ kubectl get pods -n aibrix-system -o name | grep aibrix-controller-manager | hea
The Mocked Llama has an average prompt throughput of 100 tokens per second (`avg_prompt_throughput_toks_per_s`). The AutoScaler aims to maintain each pod's metrics at 20. As indicated in the events, the KPA podautoscaler adjusted the replicas from 3 to 5.
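
That is, desired replicas ≈ ceil(100 / 20) = 5: the aggregate throughput divided by the per-pod target, which matches the rescale recorded in the events.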



# Case 4: Create an APA-Based AIBrix Pod Autoscaler on Mocked Llama

## Launching Mocked Llama

The Mocked Llama is a simulation of a vLLM-based Llama deployment. It provides mocked metrics for scaling purposes, following the standard Prometheus protocol.

For a detailed introduction, refer to the [README.md](../../development/app/README.md).
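
For reference, a scrape of the mock server is expected to return standard Prometheus text-format samples; the label set below is illustrative (only the metric name comes from this PR):

```log
# HELP avg_prompt_throughput_toks_per_s Average prompt throughput.
# TYPE avg_prompt_throughput_toks_per_s gauge
avg_prompt_throughput_toks_per_s{model_name="llama2-70b"} 100.0
```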

### Deployment on K8S

Deploy using the following commands:

```shell
kubectl apply -f docs/development/app/deployment.yaml
kubectl get deployments --all-namespaces | grep llama2
```

You should see the deployment status similar to this:

```log
NAME READY UP-TO-DATE AVAILABLE AGE
llama2-70b 3/3 3 3 16s
```

## Autoscaling

If you have created other autoscalers on this mocked Llama deployment, delete them first:
```shell
kubectl delete podautoscalers.autoscaling.aibrix.ai podautoscaler-example-mock-llama -n aibrix-system
kubectl delete podautoscalers.autoscaling.aibrix.ai podautoscaler-example-mock-llama-apa -n aibrix-system
```

Create an autoscaler of type APA:

```shell
kubectl apply -f config/samples/autoscaling_v1alpha1_mock_llama_apa.yaml
kubectl get podautoscalers --all-namespaces
```

```log
NAMESPACE NAME AGE
aibrix-system podautoscaler-example-mock-llama-apa 65m
```

## Scaling Result, Logs and Events


```shell
kubectl get deployments --all-namespaces | grep llama2
```

The deployment has been rescaled to 5 replicas:

```log
aibrix-system llama2-70b 5/5 5 5 65m
```

Check the events of the APA podautoscaler to see the scaling details:

```shell
kubectl describe podautoscalers podautoscaler-example-mock-llama-apa -n aibrix-system
```

```log
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal AlgorithmRun 78s PodAutoscaler APA algorithm run. currentReplicas: 3, desiredReplicas: 5, rescale: true
Normal SuccessfulRescale 78s PodAutoscaler New size: 5; reason: avg_prompt_throughput_toks_per_s above target
Normal AlgorithmRun 77s PodAutoscaler APA algorithm run. currentReplicas: 5, desiredReplicas: 5, rescale: false
```
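
The second `AlgorithmRun` event shows why the system settles: at 5 replicas, per-pod usage is roughly 100 / 5 = 20, exactly the target, so the metric sits inside the tolerance band and no further rescale is triggered.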


# Cleanup

To clean up the resources:
@@ -316,6 +390,7 @@ To clean up the resources:
 # Remove AIBrix resources
 kubectl delete podautoscalers.autoscaling.aibrix.ai podautoscaler-example
 kubectl delete podautoscalers.autoscaling.aibrix.ai podautoscaler-example-mock-llama -n aibrix-system
+kubectl delete podautoscalers.autoscaling.aibrix.ai podautoscaler-example-mock-llama-apa -n aibrix-system

make uninstall && make undeploy

13 changes: 5 additions & 8 deletions pkg/controller/podautoscaler/podautoscaler_controller.go
@@ -144,10 +144,8 @@ func (r *PodAutoscalerReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 	switch pa.Spec.ScalingStrategy {
 	case autoscalingv1alpha1.HPA:
 		return r.reconcileHPA(ctx, pa)
-	case autoscalingv1alpha1.KPA:
+	case autoscalingv1alpha1.KPA, autoscalingv1alpha1.APA:
 		return r.reconcileKPA(ctx, pa)
-	case autoscalingv1alpha1.APA:
-		return r.reconcileAPA(ctx, pa)
 	default:
 		return ctrl.Result{}, fmt.Errorf("unknown autoscaling strategy: %s", pa.Spec.ScalingStrategy)
 	}

**Collaborator** (on the removed `reconcileAPA` branch): Let's have some discussion here. I think we should create a new autoscaler. I understand some common parts have been implemented. We just need to figure out the long-term plan. It looks good to me for the short term.
@@ -315,10 +313,9 @@ func (r *PodAutoscalerReconciler) reconcileKPA(ctx context.Context, pa autoscali
 		rescale = desiredReplicas != currentReplicas
 	}

-	r.EventRecorder.Eventf(&pa, corev1.EventTypeNormal, "KPAAlgorithmRun",
-		"KPA algorithm run. currentReplicas: %d, desiredReplicas: %d, rescale: %t",
-		desiredReplicas, currentReplicas, rescale)
-
+	r.EventRecorder.Eventf(&pa, corev1.EventTypeNormal, "AlgorithmRun",
+		"%s algorithm run. currentReplicas: %d, desiredReplicas: %d, rescale: %t",
+		pa.Spec.ScalingStrategy, currentReplicas, desiredReplicas, rescale)
 	if rescale {

 		if err := r.updateScale(ctx, pa.Namespace, targetGR, scale, desiredReplicas); err != nil {
@@ -496,7 +493,7 @@ func (r *PodAutoscalerReconciler) computeReplicasForMetrics(ctx context.Context,
 	metricKey := metrics.NewNamespaceNameMetric(pa.Namespace, pa.Spec.ScaleTargetRef.Name, pa.Spec.TargetMetric)

 	// Calculate the desired number of pods using the autoscaler logic.
-	scaleResult := r.Autoscaler.Scale(int(originalReadyPodsCount), metricKey, currentTimestamp)
+	scaleResult := r.Autoscaler.Scale(int(originalReadyPodsCount), metricKey, currentTimestamp, pa.Spec.ScalingStrategy)
 	if scaleResult.ScaleValid {
 		logger.V(4).Info("Successfully called Scale Algorithm", "scaleResult", scaleResult)
 		return scaleResult.DesiredPodCount, metricKey.MetricName, currentTimestamp, nil
79 changes: 79 additions & 0 deletions pkg/controller/podautoscaler/scaler/apa_test.go
@@ -0,0 +1,79 @@
/*
Copyright 2024 The Aibrix Team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scaler

import (
	"testing"
	"time"

	autoscalingv1alpha1 "github.com/aibrix/aibrix/api/autoscaling/v1alpha1"

	"github.com/aibrix/aibrix/pkg/controller/podautoscaler/metrics"
)

// TestAPAScale tests the APA behavior. For now, APA implements the HCPA algorithm.
func TestAPAScale(t *testing.T) {
	readyPodCount := 5
	kpaMetricsClient := metrics.NewKPAMetricsClient()
	now := time.Now()
	metricKey := metrics.NewNamespaceNameMetric("test_ns", "llama-70b", "ttot")
	_ = kpaMetricsClient.UpdateMetricIntoWindow(metricKey, now.Add(-60*time.Second), 10.0)
	_ = kpaMetricsClient.UpdateMetricIntoWindow(metricKey, now.Add(-50*time.Second), 11.0)
	_ = kpaMetricsClient.UpdateMetricIntoWindow(metricKey, now.Add(-40*time.Second), 12.0)
	_ = kpaMetricsClient.UpdateMetricIntoWindow(metricKey, now.Add(-30*time.Second), 13.0)
	_ = kpaMetricsClient.UpdateMetricIntoWindow(metricKey, now.Add(-20*time.Second), 14.0)
	_ = kpaMetricsClient.UpdateMetricIntoWindow(metricKey, now.Add(-10*time.Second), 100.0)

	kpaScaler, err := NewKpaAutoscaler(readyPodCount,
		&DeciderKpaSpec{
			MaxScaleUpRate:           2,
			MaxScaleDownRate:         2,
			ScalingMetric:            metricKey.MetricName,
			TargetValue:              10,
			TotalValue:               500,
			PanicThreshold:           2.0,
			StableWindow:             60 * time.Second,
			ScaleDownDelay:           10 * time.Second,
			ActivationScale:          2,
			UpFluctuationTolerance:   0.1,
			DownFluctuationTolerance: 0.2,
		},
	)
	if err != nil {
		t.Fatalf("Failed to create KpaAutoscaler: %v", err)
	}
	kpaScaler.metricsClient = kpaMetricsClient
	ticker := time.NewTicker(10 * time.Second)
	defer ticker.Stop()

	// test 1:
	result := kpaScaler.Scale(readyPodCount, metricKey, now, autoscalingv1alpha1.APA)
	// The rapidly rising recent metric value makes the scaler enter panic mode.
	if result.DesiredPodCount != 10 {
		t.Errorf("expected result.DesiredPodCount = 10, got %d", result.DesiredPodCount)
	}

	// test 2:
	// 1.1 means APA won't scale up unless current usage > TargetValue * (1+1.1), i.e. 210%.
	// In this test case with UpFluctuationTolerance = 1.1, APA will not scale up.
	kpaScaler.deciderSpec.UpFluctuationTolerance = 1.1
	result = kpaScaler.Scale(readyPodCount, metricKey, now, autoscalingv1alpha1.APA)
	if result.DesiredPodCount != int32(readyPodCount) {
		t.Errorf("result should remain at previous replica count = %d, but got %d", readyPodCount, result.DesiredPodCount)
	}
}
4 changes: 3 additions & 1 deletion pkg/controller/podautoscaler/scaler/interface.go
@@ -20,6 +20,8 @@ import (
"sync"
"time"

autoscalingv1alpha1 "github.com/aibrix/aibrix/api/autoscaling/v1alpha1"

"github.com/aibrix/aibrix/pkg/controller/podautoscaler/metrics"
"sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -61,7 +63,7 @@ type Scaler interface {
 	// ScaleResult which contains the recommended number of pods to scale up or down to.
 	//
 	// Refer to: KpaAutoscaler.Scale Implementation
-	Scale(originalReadyPodsCount int, metricKey metrics.NamespaceNameMetric, now time.Time) ScaleResult
+	Scale(originalReadyPodsCount int, metricKey metrics.NamespaceNameMetric, now time.Time, strategy autoscalingv1alpha1.ScalingStrategyType) ScaleResult
 }

// ScaleResult contains the results of a scaling decision.
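With the strategy threaded into `Scale`, an implementation can pick its algorithm internally, which is how this PR routes APA through the existing KPA autoscaler. A stub method that compiles against the new signature might look like this (a hedged sketch: `stubScaler` and its behavior are invented for illustration, not the merged code):

```go
// stubScaler is a hypothetical no-op implementation of the new Scale
// signature, for illustration only.
type stubScaler struct{}

func (s *stubScaler) Scale(originalReadyPodsCount int, metricKey metrics.NamespaceNameMetric,
	now time.Time, strategy autoscalingv1alpha1.ScalingStrategyType) ScaleResult {
	// A real scaler would select its algorithm (KPA vs. APA) from `strategy`
	// before computing a recommendation; here we simply keep the current size.
	return ScaleResult{DesiredPodCount: int32(originalReadyPodsCount), ScaleValid: true}
}
```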