diff --git a/api/orchestration/v1alpha1/kvcache_types.go b/api/orchestration/v1alpha1/kvcache_types.go index 717a8394..e6b47ba8 100644 --- a/api/orchestration/v1alpha1/kvcache_types.go +++ b/api/orchestration/v1alpha1/kvcache_types.go @@ -95,7 +95,7 @@ type CacheSpec struct { // the memory resources of kvcache container // +kubebuilder:validation:Optional - // +kubebuilder:default:="2" + // +kubebuilder:default:="2Gi" Memory string `json:"memory,omitempty"` // the cpu resources of kvcache container diff --git a/docs/source/features/distributed-kv-cache.rst b/docs/source/features/distributed-kv-cache.rst index 39327514..59fc49b6 100644 --- a/docs/source/features/distributed-kv-cache.rst +++ b/docs/source/features/distributed-kv-cache.rst @@ -30,10 +30,13 @@ After deployment, we can see all the components by using ``kubectl get pods -n a .. code-block:: RST NAME READY STATUS RESTARTS AGE - deepseek-coder-7b-kvcache-596965997-p86cx 1/1 Running 0 2m + deepseek-coder-7b-kvcache-596965997-p86cx 0/1 Pending 0 2m deepseek-coder-7b-kvcache-etcd-0 1/1 Running 0 2m -After all components are running, we can use the following yaml to deploy the inference service: +.. note:: + ``deepseek-coder-7b-kvcache-596965997-p86cx`` is pending because it is waiting for the inference engine to be deployed; this is expected. + +After all components are created, we can use the following yaml to deploy the inference service: .. literalinclude:: ../../../samples/kvcache/deployment.yaml :language: yaml diff --git a/docs/source/features/lora-dynamic-loading.rst b/docs/source/features/lora-dynamic-loading.rst index 933be23e..49d3f52a 100644 --- a/docs/source/features/lora-dynamic-loading.rst +++ b/docs/source/features/lora-dynamic-loading.rst @@ -113,10 +113,6 @@ Send request using lora model name to the gateway. .. code-block:: bash - # Expose endpoint - LB_IP=$(kubectl get svc/envoy-aibrix-system-aibrix-eg-903790dc -n envoy-gateway-system -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') - ENDPOINT="${LB_IP}:80" - # send request to base model curl -v http://${ENDPOINT}/v1/completions \ -H "Content-Type: application/json" \ -d '{ @@ -168,7 +164,9 @@ User may pass in the argument ``--api-key`` or environment variable ``VLLM_API_K .. code-block:: bash - python3 -m vllm.entrypoints.openai.api_server --api-key test-key-1234567890 + python3 -m vllm.entrypoints.openai.api_server --api-key sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake + +An example is already provided; you can deploy it with ``kubectl apply -f samples/adapter/base-api-key.yaml``. In that case, lora model adapter can not query the vLLM server correctly, showing ``{"error":"Unauthorized"}`` error. You need to update ``additionalConfig`` field to pass in the API key. @@ -176,6 +174,23 @@ In that case, lora model adapter can not query the vLLM server correctly, showin .. literalinclude:: ../../../samples/adapter/adapter-api-key.yaml :language: yaml + +You need to send the request with ``--header 'Authorization: Bearer your-api-key'``: + +.. 
code-block:: bash + + # send request to the lora model protected by the api key + curl -v http://${ENDPOINT}/v1/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake" \ + -d '{ + "model": "qwen-code-lora-with-key", + "prompt": "San Francisco is a", + "max_tokens": 128, + "temperature": 0 + }' + + Runtime Support Sidecar ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/samples/adapter/adapter-api-key.yaml b/samples/adapter/adapter-api-key.yaml index b7cb2115..f85e083f 100644 --- a/samples/adapter/adapter-api-key.yaml +++ b/samples/adapter/adapter-api-key.yaml @@ -1,10 +1,10 @@ apiVersion: model.aibrix.ai/v1alpha1 kind: ModelAdapter metadata: - name: qwen-code-lora + name: qwen-code-lora-with-key namespace: default labels: - model.aibrix.ai/name: "qwen-code-lora" + model.aibrix.ai/name: "qwen-code-lora-with-key" model.aibrix.ai/port: "8000" spec: baseModel: qwen-coder-1-5b-instruct @@ -13,5 +13,5 @@ spec: model.aibrix.ai/name: qwen-coder-1-5b-instruct artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora additionalConfig: - api-key: test-key-1234567890 + api-key: sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake schedulerName: default diff --git a/samples/adapter/base-api-key.yaml b/samples/adapter/base-api-key.yaml new file mode 100644 index 00000000..da1d7663 --- /dev/null +++ b/samples/adapter/base-api-key.yaml @@ -0,0 +1,103 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct # Note: The label value `model.aibrix.ai/name` here must match the Service name. + model.aibrix.ai/port: "8000" + adapter.model.aibrix.ai/enabled: "true" + name: qwen-coder-1-5b-instruct + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + template: + metadata: + labels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + spec: + containers: + - command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + - --host + - "0.0.0.0" + - --port + - "8000" + - --model + - Qwen/Qwen2.5-Coder-1.5B-Instruct + - --served-model-name + # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name` + - qwen-coder-1-5b-instruct + - --api-key + - sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake + - --enable-lora + image: vllm/vllm-openai:v0.7.1 + imagePullPolicy: Always + name: vllm-openai + env: + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "True" + ports: + - containerPort: 8000 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + - name: aibrix-runtime + image: aibrix/runtime:v0.2.0 + command: + - aibrix_runtime + - --port + - "8080" + env: + - name: INFERENCE_ENGINE + value: vllm + - name: INFERENCE_ENGINE_ENDPOINT + value: http://localhost:8000 + ports: + - containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 2 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: qwen-coder-1-5b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: 
TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + type: ClusterIP diff --git a/samples/autoscaling/apa.yaml b/samples/autoscaling/apa.yaml index de25aeab..56b1e1bc 100644 --- a/samples/autoscaling/apa.yaml +++ b/samples/autoscaling/apa.yaml @@ -1,7 +1,7 @@ apiVersion: autoscaling.aibrix.ai/v1alpha1 kind: PodAutoscaler metadata: - name: aibrix-model-deepseek-llm-7b-chat-apa + name: deepseek-r1-distill-llama-8b-apa namespace: default labels: app.kubernetes.io/name: aibrix @@ -24,4 +24,4 @@ spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b diff --git a/samples/autoscaling/deploy.yaml b/samples/autoscaling/deploy.yaml index 11cba442..dda75c4a 100644 --- a/samples/autoscaling/deploy.yaml +++ b/samples/autoscaling/deploy.yaml @@ -2,15 +2,15 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: - model.aibrix.ai/name: deepseek-llm-7b-chat + model.aibrix.ai/name: deepseek-r1-distill-llama-8b model.aibrix.ai/port: "8000" - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b namespace: default spec: replicas: 1 selector: matchLabels: - model.aibrix.ai/name: deepseek-llm-7b-chat + model.aibrix.ai/name: deepseek-r1-distill-llama-8b strategy: type: Recreate template: @@ -20,7 +20,7 @@ spec: prometheus.io/port: "8000" prometheus.io/path: "/metrics" labels: - model.aibrix.ai/name: deepseek-llm-7b-chat + model.aibrix.ai/name: deepseek-r1-distill-llama-8b spec: containers: - command: @@ -32,12 +32,13 @@ spec: - --port - "8000" - --model - - deepseek-ai/deepseek-coder-6.7b-instruct + - deepseek-ai/DeepSeek-R1-Distill-Llama-8B - --served-model-name - - deepseek-llm-7b-chat - - --trust-remote-code + - deepseek-r1-distill-llama-8b - --dtype - half + - --max-model-len + - "12288" # 12k length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue. 
image: vllm/vllm-openai:v0.7.1 imagePullPolicy: IfNotPresent livenessProbe: @@ -46,7 +47,7 @@ spec: path: /health port: 8000 scheme: HTTP - initialDelaySeconds: 90 + initialDelaySeconds: 120 periodSeconds: 5 successThreshold: 1 timeoutSeconds: 1 @@ -55,12 +56,12 @@ spec: - containerPort: 8000 protocol: TCP readinessProbe: - failureThreshold: 3 + failureThreshold: 5 httpGet: path: /health port: 8000 scheme: HTTP - initialDelaySeconds: 90 + initialDelaySeconds: 120 periodSeconds: 5 successThreshold: 1 timeoutSeconds: 1 @@ -69,3 +70,30 @@ spec: nvidia.com/gpu: "1" requests: nvidia.com/gpu: "1" + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-r1-distill-llama-8b # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + type: ClusterIP diff --git a/samples/autoscaling/hpa.yaml b/samples/autoscaling/hpa.yaml index f30ff206..abcde92f 100644 --- a/samples/autoscaling/hpa.yaml +++ b/samples/autoscaling/hpa.yaml @@ -1,7 +1,7 @@ apiVersion: autoscaling.aibrix.ai/v1alpha1 kind: PodAutoscaler metadata: - name: aibrix-model-deepseek-llm-7b-chat-hpa + name: deepseek-r1-distill-llama-8b-hpa namespace: default labels: app.kubernetes.io/name: aibrix @@ -20,4 +20,4 @@ spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b diff --git a/samples/autoscaling/kpa.yaml b/samples/autoscaling/kpa.yaml index 03831aae..01304391 100644 --- a/samples/autoscaling/kpa.yaml +++ b/samples/autoscaling/kpa.yaml @@ -1,7 +1,7 @@ apiVersion: autoscaling.aibrix.ai/v1alpha1 kind: PodAutoscaler metadata: - name: aibrix-model-deepseek-llm-7b-chat-kpa + name: deepseek-r1-distill-llama-8b-kpa namespace: default labels: app.kubernetes.io/name: aibrix @@ -22,4 +22,4 @@ spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b diff --git a/samples/distributed/fleet-two-node.yaml b/samples/distributed/fleet-two-node.yaml index 850eea4c..a08b9692 100644 --- a/samples/distributed/fleet-two-node.yaml +++ b/samples/distributed/fleet-two-node.yaml @@ -4,12 +4,12 @@ metadata: labels: app.kubernetes.io/name: aibrix app.kubernetes.io/managed-by: kustomize - name: facebook-opt-13b + name: qwen-coder-7b-instruct spec: replicas: 1 selector: matchLabels: - model.aibrix.ai/name: facebook-opt-13b + model.aibrix.ai/name: qwen-coder-7b-instruct strategy: rollingUpdate: maxSurge: 25% @@ -18,7 +18,7 @@ spec: template: metadata: labels: - model.aibrix.ai/name: facebook-opt-13b + model.aibrix.ai/name: qwen-coder-7b-instruct annotations: ray.io/overwrite-container-cmd: "true" spec: @@ -41,7 +41,7 @@ spec: - containerPort: 8000 name: service command: ["/bin/bash", "-lc", "--"] - args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"] + args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve Qwen/Qwen2.5-Coder-7B-Instruct --tensor-parallel-size 2 --distributed-executor-backend ray"] resources: limits: cpu: "4" diff --git 
a/samples/kvcache/deployment-tp.yaml b/samples/kvcache/deployment-tp.yaml index 2a391999..10f51b01 100644 --- a/samples/kvcache/deployment-tp.yaml +++ b/samples/kvcache/deployment-tp.yaml @@ -57,7 +57,7 @@ spec: - name: AIBRIX_LLM_KV_CACHE_SOCKET value: /var/run/vineyard.sock - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT - value: "aibrix-model-deepseek-coder-33b-kvcache-rpc:9600" + value: "deepseek-coder-33b-kvcache-rpc:9600" - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE value: "1" - name: "VINEYARD_CACHE_METRICS_ENABLED" @@ -73,4 +73,31 @@ spec: volumes: - name: kvcache-socket hostPath: - path: /var/run/vineyard-kubernetes/default/deepseek-coder-33b-kvcache \ No newline at end of file + path: /var/run/vineyard-kubernetes/default/deepseek-coder-33b-kvcache + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-coder-33b-instruct + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-coder-33b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: deepseek-coder-33b-instruct + type: ClusterIP diff --git a/samples/kvcache/deployment.yaml b/samples/kvcache/deployment.yaml index ff752e61..c363cc55 100644 --- a/samples/kvcache/deployment.yaml +++ b/samples/kvcache/deployment.yaml @@ -35,7 +35,7 @@ spec: - --served-model-name - deepseek-coder-7b-instruct - --max-model-len - - "12288" + - "8192" # please modify this field if your gpu has more room - --enable-prefix-caching - --disable-fastapi-docs env: @@ -52,7 +52,7 @@ spec: - name: AIBRIX_LLM_KV_CACHE_SOCKET value: /var/run/vineyard.sock - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT - value: "aibrix-model-deepseek-coder-7b-kvcache-rpc:9600" + value: "deepseek-coder-7b-kvcache-rpc:9600" - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE value: "1" - name: "VINEYARD_CACHE_METRICS_ENABLED" @@ -69,3 +69,30 @@ spec: - name: kvcache-socket hostPath: path: /var/run/vineyard-kubernetes/default/deepseek-coder-7b-kvcache + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-coder-7b-instruct + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-coder-7b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: deepseek-coder-7b-instruct + type: ClusterIP diff --git a/samples/kvcache/kvcache-tp.yaml b/samples/kvcache/kvcache-tp.yaml index d69eb674..b5c4d278 100644 --- a/samples/kvcache/kvcache-tp.yaml +++ b/samples/kvcache/kvcache-tp.yaml @@ -14,3 +14,5 @@ spec: cacheSpec: image: aibrix/vineyardd:20241120 imagePullPolicy: IfNotPresent + cpu: 2000m + memory: 4Gi \ No newline at end of file diff --git a/samples/kvcache/kvcache.yaml b/samples/kvcache/kvcache.yaml index fb178195..a841f937 100644 --- a/samples/kvcache/kvcache.yaml +++ b/samples/kvcache/kvcache.yaml @@ -13,3 +13,5 @@ spec: cacheSpec: image: aibrix/vineyardd:20241120 imagePullPolicy: IfNotPresent + cpu: 2000m + memory: 4Gi diff --git a/samples/quickstart/model.yaml b/samples/quickstart/model.yaml index 39eb5133..e5799782 
100644 --- a/samples/quickstart/model.yaml +++ b/samples/quickstart/model.yaml @@ -33,7 +33,7 @@ spec: - --max-model-len - "12288" # 24k length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue. image: vllm/vllm-openai:v0.7.1 - imagePullPolicy: Always + imagePullPolicy: IfNotPresent name: vllm-openai ports: - containerPort: 8000 @@ -43,6 +43,27 @@ spec: nvidia.com/gpu: "1" requests: nvidia.com/gpu: "1" + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 120 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 5 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 120 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + --- apiVersion: v1