diff --git a/api/orchestration/v1alpha1/kvcache_types.go b/api/orchestration/v1alpha1/kvcache_types.go index 717a8394..e6b47ba8 100644 --- a/api/orchestration/v1alpha1/kvcache_types.go +++ b/api/orchestration/v1alpha1/kvcache_types.go @@ -95,7 +95,7 @@ type CacheSpec struct { // the memory resources of kvcache container // +kubebuilder:validation:Optional - // +kubebuilder:default:="2" + // +kubebuilder:default:="2Gi" Memory string `json:"memory,omitempty"` // the cpu resources of kvcache container diff --git a/docs/source/features/distributed-kv-cache.rst b/docs/source/features/distributed-kv-cache.rst index 39327514..59fc49b6 100644 --- a/docs/source/features/distributed-kv-cache.rst +++ b/docs/source/features/distributed-kv-cache.rst @@ -30,10 +30,13 @@ After deployment, we can see all the components by using ``kubectl get pods -n a .. code-block:: RST NAME READY STATUS RESTARTS AGE - deepseek-coder-7b-kvcache-596965997-p86cx 1/1 Running 0 2m + deepseek-coder-7b-kvcache-596965997-p86cx 0/1 Pending 0 2m deepseek-coder-7b-kvcache-etcd-0 1/1 Running 0 2m -After all components are running, we can use the following yaml to deploy the inference service: +.. note:: + ``deepseek-coder-7b-kvcache-596965997-p86cx`` is pending because it is waiting for the inference engine to be deployed; this is expected. + +After all components are created, we can use the following yaml to deploy the inference service: .. literalinclude:: ../../../samples/kvcache/deployment.yaml :language: yaml diff --git a/docs/source/features/lora-dynamic-loading.rst b/docs/source/features/lora-dynamic-loading.rst index 933be23e..49d3f52a 100644 --- a/docs/source/features/lora-dynamic-loading.rst +++ b/docs/source/features/lora-dynamic-loading.rst @@ -113,10 +113,6 @@ Send request using lora model name to the gateway. .. code-block:: bash - # Expose endpoint - LB_IP=$(kubectl get svc/envoy-aibrix-system-aibrix-eg-903790dc -n envoy-gateway-system -o=jsonpath='{.status.loadBalancer.ingress[0].ip}') - ENDPOINT="${LB_IP}:80" - # send request to base model curl -v http://${ENDPOINT}/v1/completions \ -H "Content-Type: application/json" \ -d '{ @@ -168,7 +164,9 @@ User may pass in the argument ``--api-key`` or environment variable ``VLLM_API_K .. code-block:: bash - python3 -m vllm.entrypoints.openai.api_server --api-key test-key-1234567890 + python3 -m vllm.entrypoints.openai.api_server --api-key sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake + +An example is already provided; you can deploy it with ``kubectl apply -f samples/adapter/base-api-key.yaml``. In that case, lora model adapter can not query the vLLM server correctly, showing ``{"error":"Unauthorized"}`` error. You need to update ``additionalConfig`` field to pass in the API key. @@ -176,6 +174,23 @@ In that case, lora model adapter can not query the vLLM server correctly, showin .. literalinclude:: ../../../samples/adapter/adapter-api-key.yaml :language: yaml + +You need to send the request with ``--header 'Authorization: Bearer your-api-key'``: + +.. 
code-block:: bash + + # send request to the lora model protected by the api key + curl -v http://${ENDPOINT}/v1/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake" \ + -d '{ + "model": "qwen-code-lora-with-key", + "prompt": "San Francisco is a", + "max_tokens": 128, + "temperature": 0 + }' + + Runtime Support Sidecar ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/samples/adapter/adapter-api-key.yaml b/samples/adapter/adapter-api-key.yaml index b7cb2115..f85e083f 100644 --- a/samples/adapter/adapter-api-key.yaml +++ b/samples/adapter/adapter-api-key.yaml @@ -1,10 +1,10 @@ apiVersion: model.aibrix.ai/v1alpha1 kind: ModelAdapter metadata: - name: qwen-code-lora + name: qwen-code-lora-with-key namespace: default labels: - model.aibrix.ai/name: "qwen-code-lora" + model.aibrix.ai/name: "qwen-code-lora-with-key" model.aibrix.ai/port: "8000" spec: baseModel: qwen-coder-1-5b-instruct @@ -13,5 +13,5 @@ spec: model.aibrix.ai/name: qwen-coder-1-5b-instruct artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora additionalConfig: - api-key: test-key-1234567890 + api-key: sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake schedulerName: default diff --git a/samples/adapter/base-api-key.yaml b/samples/adapter/base-api-key.yaml new file mode 100644 index 00000000..da1d7663 --- /dev/null +++ b/samples/adapter/base-api-key.yaml @@ -0,0 +1,103 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct # Note: The label value `model.aibrix.ai/name` here must match the Service name. + model.aibrix.ai/port: "8000" + adapter.model.aibrix.ai/enabled: "true" + name: qwen-coder-1-5b-instruct + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + template: + metadata: + labels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + spec: + containers: + - command: + - python3 + - -m + - vllm.entrypoints.openai.api_server + - --host + - "0.0.0.0" + - --port + - "8000" + - --model + - Qwen/Qwen2.5-Coder-1.5B-Instruct + - --served-model-name + # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name` + - qwen-coder-1-5b-instruct + - --api-key + - sk-kFJ12nKsFakefVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BFake + - --enable-lora + image: vllm/vllm-openai:v0.7.1 + imagePullPolicy: Always + name: vllm-openai + env: + - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING + value: "True" + ports: + - containerPort: 8000 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "1" + requests: + nvidia.com/gpu: "1" + - name: aibrix-runtime + image: aibrix/runtime:v0.2.0 + command: + - aibrix_runtime + - --port + - "8080" + env: + - name: INFERENCE_ENGINE + value: vllm + - name: INFERENCE_ENGINE_ENDPOINT + value: http://localhost:8000 + ports: + - containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 3 + periodSeconds: 2 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: qwen-coder-1-5b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: 
TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: qwen-coder-1-5b-instruct + type: ClusterIP diff --git a/samples/autoscaling/apa.yaml b/samples/autoscaling/apa.yaml index de25aeab..56b1e1bc 100644 --- a/samples/autoscaling/apa.yaml +++ b/samples/autoscaling/apa.yaml @@ -1,7 +1,7 @@ apiVersion: autoscaling.aibrix.ai/v1alpha1 kind: PodAutoscaler metadata: - name: aibrix-model-deepseek-llm-7b-chat-apa + name: deepseek-r1-distill-llama-8b-apa namespace: default labels: app.kubernetes.io/name: aibrix @@ -24,4 +24,4 @@ spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b diff --git a/samples/autoscaling/deploy.yaml b/samples/autoscaling/deploy.yaml index 11cba442..dda75c4a 100644 --- a/samples/autoscaling/deploy.yaml +++ b/samples/autoscaling/deploy.yaml @@ -2,15 +2,15 @@ apiVersion: apps/v1 kind: Deployment metadata: labels: - model.aibrix.ai/name: deepseek-llm-7b-chat + model.aibrix.ai/name: deepseek-r1-distill-llama-8b model.aibrix.ai/port: "8000" - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b namespace: default spec: replicas: 1 selector: matchLabels: - model.aibrix.ai/name: deepseek-llm-7b-chat + model.aibrix.ai/name: deepseek-r1-distill-llama-8b strategy: type: Recreate template: @@ -20,7 +20,7 @@ spec: prometheus.io/port: "8000" prometheus.io/path: "/metrics" labels: - model.aibrix.ai/name: deepseek-llm-7b-chat + model.aibrix.ai/name: deepseek-r1-distill-llama-8b spec: containers: - command: @@ -32,12 +32,13 @@ spec: - --port - "8000" - --model - - deepseek-ai/deepseek-coder-6.7b-instruct + - deepseek-ai/DeepSeek-R1-Distill-Llama-8B - --served-model-name - - deepseek-llm-7b-chat - - --trust-remote-code + - deepseek-r1-distill-llama-8b - --dtype - half + - --max-model-len + - "12288" # 12k length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue. 
image: vllm/vllm-openai:v0.7.1 imagePullPolicy: IfNotPresent livenessProbe: @@ -46,7 +47,7 @@ spec: path: /health port: 8000 scheme: HTTP - initialDelaySeconds: 90 + initialDelaySeconds: 120 periodSeconds: 5 successThreshold: 1 timeoutSeconds: 1 @@ -55,12 +56,12 @@ spec: - containerPort: 8000 protocol: TCP readinessProbe: - failureThreshold: 3 + failureThreshold: 5 httpGet: path: /health port: 8000 scheme: HTTP - initialDelaySeconds: 90 + initialDelaySeconds: 120 periodSeconds: 5 successThreshold: 1 timeoutSeconds: 1 @@ -69,3 +70,30 @@ spec: nvidia.com/gpu: "1" requests: nvidia.com/gpu: "1" + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-r1-distill-llama-8b # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: deepseek-r1-distill-llama-8b + type: ClusterIP diff --git a/samples/autoscaling/hpa.yaml b/samples/autoscaling/hpa.yaml index f30ff206..abcde92f 100644 --- a/samples/autoscaling/hpa.yaml +++ b/samples/autoscaling/hpa.yaml @@ -1,7 +1,7 @@ apiVersion: autoscaling.aibrix.ai/v1alpha1 kind: PodAutoscaler metadata: - name: aibrix-model-deepseek-llm-7b-chat-hpa + name: deepseek-r1-distill-llama-8b-hpa namespace: default labels: app.kubernetes.io/name: aibrix @@ -20,4 +20,4 @@ spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b diff --git a/samples/autoscaling/kpa.yaml b/samples/autoscaling/kpa.yaml index 03831aae..01304391 100644 --- a/samples/autoscaling/kpa.yaml +++ b/samples/autoscaling/kpa.yaml @@ -1,7 +1,7 @@ apiVersion: autoscaling.aibrix.ai/v1alpha1 kind: PodAutoscaler metadata: - name: aibrix-model-deepseek-llm-7b-chat-kpa + name: deepseek-r1-distill-llama-8b-kpa namespace: default labels: app.kubernetes.io/name: aibrix @@ -22,4 +22,4 @@ spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment - name: aibrix-model-deepseek-llm-7b-chat + name: deepseek-r1-distill-llama-8b diff --git a/samples/distributed/fleet-two-node.yaml b/samples/distributed/fleet-two-node.yaml index 850eea4c..a08b9692 100644 --- a/samples/distributed/fleet-two-node.yaml +++ b/samples/distributed/fleet-two-node.yaml @@ -4,12 +4,12 @@ metadata: labels: app.kubernetes.io/name: aibrix app.kubernetes.io/managed-by: kustomize - name: facebook-opt-13b + name: qwen-coder-7b-instruct spec: replicas: 1 selector: matchLabels: - model.aibrix.ai/name: facebook-opt-13b + model.aibrix.ai/name: qwen-coder-7b-instruct strategy: rollingUpdate: maxSurge: 25% @@ -18,7 +18,7 @@ spec: template: metadata: labels: - model.aibrix.ai/name: facebook-opt-13b + model.aibrix.ai/name: qwen-coder-7b-instruct annotations: ray.io/overwrite-container-cmd: "true" spec: @@ -41,7 +41,7 @@ spec: - containerPort: 8000 name: service command: ["/bin/bash", "-lc", "--"] - args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"] + args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve Qwen/Qwen2.5-Coder-7B-Instruct --tensor-parallel-size 2 --distributed-executor-backend ray"] resources: limits: cpu: "4" diff --git 
a/samples/kvcache/deployment-tp.yaml b/samples/kvcache/deployment-tp.yaml index 2a391999..10f51b01 100644 --- a/samples/kvcache/deployment-tp.yaml +++ b/samples/kvcache/deployment-tp.yaml @@ -57,7 +57,7 @@ spec: - name: AIBRIX_LLM_KV_CACHE_SOCKET value: /var/run/vineyard.sock - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT - value: "aibrix-model-deepseek-coder-33b-kvcache-rpc:9600" + value: "deepseek-coder-33b-kvcache-rpc:9600" - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE value: "1" - name: "VINEYARD_CACHE_METRICS_ENABLED" @@ -73,4 +73,31 @@ spec: volumes: - name: kvcache-socket hostPath: - path: /var/run/vineyard-kubernetes/default/deepseek-coder-33b-kvcache \ No newline at end of file + path: /var/run/vineyard-kubernetes/default/deepseek-coder-33b-kvcache + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-coder-33b-instruct + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-coder-33b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: deepseek-coder-33b-instruct + type: ClusterIP diff --git a/samples/kvcache/deployment.yaml b/samples/kvcache/deployment.yaml index ff752e61..c363cc55 100644 --- a/samples/kvcache/deployment.yaml +++ b/samples/kvcache/deployment.yaml @@ -35,7 +35,7 @@ spec: - --served-model-name - deepseek-coder-7b-instruct - --max-model-len - - "12288" + - "8192" # please modify this field if your gpu has more room - --enable-prefix-caching - --disable-fastapi-docs env: @@ -52,7 +52,7 @@ spec: - name: AIBRIX_LLM_KV_CACHE_SOCKET value: /var/run/vineyard.sock - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT - value: "aibrix-model-deepseek-coder-7b-kvcache-rpc:9600" + value: "deepseek-coder-7b-kvcache-rpc:9600" - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE value: "1" - name: "VINEYARD_CACHE_METRICS_ENABLED" @@ -69,3 +69,30 @@ spec: - name: kvcache-socket hostPath: path: /var/run/vineyard-kubernetes/default/deepseek-coder-7b-kvcache + +--- + +apiVersion: v1 +kind: Service +metadata: + labels: + model.aibrix.ai/name: deepseek-coder-7b-instruct + prometheus-discovery: "true" + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + name: deepseek-coder-7b-instruct # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment + namespace: default +spec: + ports: + - name: serve + port: 8000 + protocol: TCP + targetPort: 8000 + - name: http + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + model.aibrix.ai/name: deepseek-coder-7b-instruct + type: ClusterIP diff --git a/samples/kvcache/kvcache-tp.yaml b/samples/kvcache/kvcache-tp.yaml index d69eb674..b5c4d278 100644 --- a/samples/kvcache/kvcache-tp.yaml +++ b/samples/kvcache/kvcache-tp.yaml @@ -14,3 +14,5 @@ spec: cacheSpec: image: aibrix/vineyardd:20241120 imagePullPolicy: IfNotPresent + cpu: 2000m + memory: 4Gi \ No newline at end of file diff --git a/samples/kvcache/kvcache.yaml b/samples/kvcache/kvcache.yaml index fb178195..a841f937 100644 --- a/samples/kvcache/kvcache.yaml +++ b/samples/kvcache/kvcache.yaml @@ -13,3 +13,5 @@ spec: cacheSpec: image: aibrix/vineyardd:20241120 imagePullPolicy: IfNotPresent + cpu: 2000m + memory: 4Gi diff --git a/samples/quickstart/model.yaml b/samples/quickstart/model.yaml index 39eb5133..e5799782 
100644 --- a/samples/quickstart/model.yaml +++ b/samples/quickstart/model.yaml @@ -33,7 +33,7 @@ spec: - --max-model-len - "12288" # 24k length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue. image: vllm/vllm-openai:v0.7.1 - imagePullPolicy: Always + imagePullPolicy: IfNotPresent name: vllm-openai ports: - containerPort: 8000 @@ -43,6 +43,27 @@ spec: nvidia.com/gpu: "1" requests: nvidia.com/gpu: "1" + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 120 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 5 + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 120 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + --- apiVersion: v1