diff --git a/assets/storage_class_template.yaml b/assets/storage_class_template.yaml deleted file mode 100644 index 3b8bb85..0000000 --- a/assets/storage_class_template.yaml +++ /dev/null @@ -1,17 +0,0 @@ -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: $sc_name -provisioner: driver.longhorn.io -allowVolumeExpansion: true -reclaimPolicy: Delete -volumeBindingMode: WaitForFirstConsumer -parameters: - shareManagerNodeSelector: $sc_label_selector - numberOfReplicas: "$sc_replicas" - staleReplicaTimeout: "2880" - fromBackup: "" - fsType: "ext4" - #nfsOptions: "vers=4.2,noresvport,softerr,timeo=600,retrans=5" - #nfsOptions: "hard,timeo=50,retrans=1" - diff --git a/kalavai_client/cli.py b/kalavai_client/cli.py index a267b41..d92262a 100644 --- a/kalavai_client/cli.py +++ b/kalavai_client/cli.py @@ -3,7 +3,6 @@ import os import json import uuid -from string import Template import time import socket from pathlib import Path @@ -54,10 +53,6 @@ WATCHER_PORT_KEY, MANDATORY_TOKEN_FIELDS, USER_NODE_LABEL_KEY, - DEPLOY_HELIOS_KEY, - LONGHORN_UI_PORT_KEY, - LONGHORN_MANAGER_PORT_KEY, - KALAVAI_API_ENDPOINT_KEY, IS_PUBLIC_POOL_KEY ) from kalavai_client.cluster import ( @@ -66,26 +61,26 @@ KALAVAI_PLATFORM_URL = os.getenv("KALAVAI_PLATFORM_URL", "https://platform.kalavai.net") -KALAVAI_API_ENDPOINT = os.getenv("KALAVAI_API_ENDPOINT", "https://platform.kalavai.net/_/api") LOCAL_TEMPLATES_DIR = os.getenv("LOCAL_TEMPLATES_DIR", None) VERSION = 1 RESOURCE_EXCLUDE = ["ephemeral-storage", "hugepages-1Gi", "hugepages-2Mi", "pods"] CORE_NAMESPACES = ["lws-system", "kube-system", "gpu-operator", "kalavai"] -TEMPLATE_LABEL = "kalavai.lws.name" +TEMPLATE_LABEL = "kalavai.job.name" RAY_LABEL = "kalavai.ray.name" PVC_NAME_LABEL = "kalavai.storage.name" +POOL_CONFIG_TEMPLATE = resource_path("assets/pool_config_template.yaml") +POOL_CONFIG_DEFAULT_VALUES = resource_path("assets/pool_config_values.yaml") STORAGE_CLASS_NAME = "longhorn-rwx" STORAGE_CLASS_LABEL = "kalavai.storage.enabled" DEFAULT_STORAGE_NAME = "pool-cache" DEFAULT_STORAGE_SIZE = 5 -DEFAULT_STORAGE_REPLICAS = 1 USER_NODE_LABEL = "kalavai.cluster.user" KUBE_VERSION = os.getenv("KALAVAI_KUBE_VERSION", "v1.31.1+k3s1") DEFAULT_FLANNEL_IFACE = os.getenv("KALAVAI_FLANNEL_IFACE", "netmaker") FORBIDEDEN_IPS = ["127.0.0.1"] # kalavai templates HELM_APPS_FILE = resource_path("assets/apps.yaml") -STORAGE_CLASS_TEMPLATE_FILE = resource_path("assets/storage_class_template.yaml") +HELM_APPS_VALUES = resource_path("assets/apps_values.yaml") # user specific config files USER_HELM_APPS_FILE = user_path("apps.yaml") USER_KUBECONFIG_FILE = user_path("kubeconfig") @@ -192,6 +187,30 @@ def init_user_workspace(): except Exception as e: console.log(f"[red]Error when connecting to kalavai service: {str(e)}") +def pool_init(): + """Deploy configured objects to initialise pool""" + # load template config and populate with values + sidecar_template_yaml = load_template( + template_path=POOL_CONFIG_TEMPLATE, + values={}, + default_values_path=POOL_CONFIG_DEFAULT_VALUES) + + try: + result = request_to_server( + method="post", + endpoint="/v1/deploy_generic_model", + data={"config": sidecar_template_yaml}, + server_creds=USER_LOCAL_SERVER_FILE, + user_cookie=USER_COOKIE + ) + if len(result['failed']) > 0: + console.log(f"[red]Error when deploying pool config\n\n{result['failed']}") + if len(result['successful']) > 0: + console.log(f"[green]Deployed pool config!") + except Exception as e: + console.log(f"[red]Error when connecting to kalavai service: {str(e)}") + + 
def select_ip_address(subnet=None): ips = [] for iface in ni.interfaces(): @@ -414,7 +433,7 @@ def pool__list(*others, user_only=False): console.log("[white]Use [yellow]kalavai pool join [white]to join a public pool") @arguably.command -def pool__start(cluster_name, *others, ip_address: str=None, location: str=None): +def pool__start(cluster_name, *others, ip_address: str=None, location: str=None, app_values: str=HELM_APPS_VALUES): """ Start Kalavai pool and start/resume sharing resources. @@ -455,8 +474,6 @@ def pool__start(cluster_name, *others, ip_address: str=None, location: str=None write_auth_key = str(uuid.uuid4()) readonly_auth_key = str(uuid.uuid4()) watcher_port = 31000 - longhorn_ui_port = 30000 - longhorn_manager_port = 30001 watcher_service = f"{ip_address}:{watcher_port}" values = { CLUSTER_NAME_KEY: cluster_name, @@ -465,13 +482,9 @@ def pool__start(cluster_name, *others, ip_address: str=None, location: str=None READONLY_AUTH_KEY: readonly_auth_key, WRITE_AUTH_KEY: write_auth_key, WATCHER_PORT_KEY: watcher_port, - LONGHORN_UI_PORT_KEY: longhorn_ui_port, - LONGHORN_MANAGER_PORT_KEY: longhorn_manager_port, WATCHER_SERVICE_KEY: watcher_service, USER_NODE_LABEL_KEY: USER_NODE_LABEL, - DEPLOY_HELIOS_KEY: location is not None, - IS_PUBLIC_POOL_KEY: location is not None, - KALAVAI_API_ENDPOINT_KEY: KALAVAI_API_ENDPOINT + IS_PUBLIC_POOL_KEY: location is not None } # 1. start k3s server @@ -499,12 +512,14 @@ def pool__start(cluster_name, *others, ip_address: str=None, location: str=None console.log("Install dependencies...") # set template values in helmfile - with open(HELM_APPS_FILE, "r") as f: - config = Template(f.read()) - config = config.substitute(values) + helm_yaml = load_template( + template_path=HELM_APPS_FILE, + values=values, + default_values_path=app_values, + force_defaults=True) with open(USER_HELM_APPS_FILE, "w") as f: - f.write(config) + f.write(helm_yaml) CLUSTER.update_dependencies( dependencies_file=USER_HELM_APPS_FILE ) @@ -526,8 +541,8 @@ def pool__start(cluster_name, *others, ip_address: str=None, location: str=None break console.log("Initialise user workspace...") init_user_workspace() - console.log(f"Initialising storage: {DEFAULT_STORAGE_NAME} ({DEFAULT_STORAGE_SIZE}Gi)...") - storage__init() + console.log(f"Initialising pool config...") + pool_init() storage__create() return None @@ -913,40 +928,6 @@ def pool__status(*others, log_file=None): for log in logs: console.log(f"{log}\n") -@arguably.command -def storage__init(replicas=DEFAULT_STORAGE_REPLICAS, *others): - """ - Create storage for the cluster - """ - try: - CLUSTER.validate_cluster() - except Exception as e: - console.log(f"[red]Problems with your pool: {str(e)}") - return - - sidecar_template_yaml = load_template( - template_path=STORAGE_CLASS_TEMPLATE_FILE, - values={ - "sc_name": STORAGE_CLASS_NAME, - "sc_label_selector": f"{STORAGE_CLASS_LABEL}:True", - "sc_replicas": replicas - } - ) - try: - result = request_to_server( - method="post", - endpoint="/v1/deploy_generic_model", - data={"config": sidecar_template_yaml}, - server_creds=USER_LOCAL_SERVER_FILE, - user_cookie=USER_COOKIE - ) - if len(result['failed']) > 0: - console.log(f"[red]Error when creating storage class\n\n{result['failed']}") - if len(result['successful']) > 0: - console.log(f"[green]Created storage class: {STORAGE_CLASS_NAME} ({replicas} replicas)") - except Exception as e: - console.log(f"[red]Error when connecting to kalavai service: {str(e)}") - @arguably.command def storage__create(name=DEFAULT_STORAGE_NAME, 
storage=DEFAULT_STORAGE_SIZE, *others): """ @@ -1230,9 +1211,12 @@ def generate_gpu_annotation(input_message, values, value_key, annotation_key): # deploy template with kube-watcher data = { "object": { - "group": "leaderworkerset.x-k8s.io", - "api_version": "v1", - "plural": "leaderworkersets" + "group": "batch.volcano.sh", + "api_version": "v1alpha1", + "plural": "jobs" + # "group": "leaderworkerset.x-k8s.io", + # "api_version": "v1", + # "plural": "leaderworkersets" }, "body": template_yaml } @@ -1366,9 +1350,12 @@ def job__list(*others): return data = { - "group": "leaderworkerset.x-k8s.io", - "api_version": "v1", - "plural": "leaderworkersets", + "group": "batch.volcano.sh", + "api_version": "v1alpha1", + "plural": "jobs" + # "group": "leaderworkerset.x-k8s.io", + # "api_version": "v1", + # "plural": "leaderworkersets", } try: result = request_to_server( @@ -1378,7 +1365,7 @@ def job__list(*others): server_creds=USER_LOCAL_SERVER_FILE, user_cookie=USER_COOKIE ) - deployment_names = [d["metadata"]["name"] for d in result["items"]] + deployment_names = [d["metadata"]["labels"][TEMPLATE_LABEL] for d in result["items"]] except Exception as e: console.log(f"[red]Error when connecting to kalavai service: {str(e)}") @@ -1394,9 +1381,12 @@ def job__list(*others): try: # get status for deployment data = { - "group": "leaderworkerset.x-k8s.io", - "api_version": "v1", - "plural": "leaderworkersets", + "group": "batch.volcano.sh", + "api_version": "v1alpha1", + "plural": "jobs", + # "group": "leaderworkerset.x-k8s.io", + # "api_version": "v1", + # "plural": "leaderworkersets", "name": deployment } result = request_to_server( @@ -1408,14 +1398,15 @@ def job__list(*others): ) if len(result) > 0: last = result[-1] - statuses = f"{last['type']}: {last['message']}" + statuses = f"[{last['lastTransitionTime']}] {last['status']}" else: statuses = "Unknown" # get pod statuses data = { - "label": "leaderworkerset.sigs.k8s.io/name", + "label": TEMPLATE_LABEL, "value": deployment } + # TODO result = request_to_server( method="post", endpoint="/v1/get_pods_status_for_label", @@ -1458,7 +1449,7 @@ def job__list(*others): @arguably.command -def job__logs(name, *others, pod_name=None, stream=False): +def job__logs(name, *others, pod_name=None, stream=False, tail=100): """ Get logs for a specific job """ @@ -1469,11 +1460,12 @@ def job__logs(name, *others, pod_name=None, stream=False): return data = { - "label": "leaderworkerset.sigs.k8s.io/name", + "label": TEMPLATE_LABEL, "value": name } while True: try: + # send tail as parameter (fetch only last _tail_ lines) result = request_to_server( method="post", endpoint="/v1/get_logs_for_label", @@ -1515,7 +1507,7 @@ def job__manifest(*others, name): return data = { - "label": "leaderworkerset.sigs.k8s.io/name", + "label": TEMPLATE_LABEL, "value": name } try: diff --git a/kalavai_client/utils.py b/kalavai_client/utils.py index 143d77c..677b2e1 100644 --- a/kalavai_client/utils.py +++ b/kalavai_client/utils.py @@ -36,11 +36,7 @@ READONLY_AUTH_KEY = "watcher_readonly_key" WATCHER_SERVICE_KEY = "watcher_service" WATCHER_PORT_KEY = "watcher_port" -LONGHORN_UI_PORT_KEY = "longhorn_ui_port" -LONGHORN_MANAGER_PORT_KEY = "longhorn_manager_port" -DEPLOY_HELIOS_KEY = "deploy_helios" IS_PUBLIC_POOL_KEY = "is_public_pool" -KALAVAI_API_ENDPOINT_KEY = "kalavai_api_endpoint" MANDATORY_TOKEN_FIELDS = [ CLUSTER_IP_KEY, CLUSTER_TOKEN_KEY, @@ -369,7 +365,10 @@ def store_server_info(server_ip, auth_key, watcher_service, file, node_name, clu }, f) return True -def 
load_template(template_path, values, default_values_path=None): +def populate_template(template_str, values_dict): + return Template(template_str).render(values_dict) + +def load_template(template_path, values, default_values_path=None, force_defaults=False): if not Path(template_path).exists(): raise FileNotFoundError(f"{template_path} does not exist") @@ -381,12 +380,10 @@ def load_template(template_path, values, default_values_path=None): with open(default_values_path, 'r') as f: default_values = yaml.safe_load(f) for default in default_values: - if default["name"] not in values: + if not force_defaults or default["name"] not in values: values[default['name']] = default['default'] - template = Template(yaml_template) - - return template.render(values) + return populate_template(template_str=yaml_template, values_dict=values) def user_confirm(question: str, options: list, multiple: bool=False) -> int: diff --git a/templates/aphrodite/examples/qwen2.5-0.5B.yaml b/templates/aphrodite/examples/qwen2.5-0.5B.yaml index 9ac9712..ff2e07a 100644 --- a/templates/aphrodite/examples/qwen2.5-0.5B.yaml +++ b/templates/aphrodite/examples/qwen2.5-0.5B.yaml @@ -8,10 +8,10 @@ default: "pool-cache" description: "Pool storage to use to cache model weights" -- name: num_workers - value: "1" - default: "1" - description: "Workers per deployment (for tensor parallelism)" +- name: remote_workers + value: "0" + default: "0" + description: "Number of remote workers (for tensor and pipeline parallelism). This is in addition to the main node" - name: repo_id value: Qwen/Qwen2.5-0.5B-Instruct diff --git a/templates/aphrodite/template.yaml b/templates/aphrodite/template.yaml index 4d9b89e..2363676 100644 --- a/templates/aphrodite/template.yaml +++ b/templates/aphrodite/template.yaml @@ -1,16 +1,24 @@ -apiVersion: leaderworkerset.x-k8s.io/v1 -kind: LeaderWorkerSet +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job metadata: name: {{deployment_name}} labels: # must have this label - kalavai.lws.name: {{deployment_name}} + kalavai.job.name: {{deployment_name}} spec: - replicas: {{replicas}} - leaderWorkerTemplate: - size: {{num_workers}} - restartPolicy: RecreateGroupOnPodRestart - leaderTemplate: + queue: {{queue_name}} + #minAvailable: 2 + schedulerName: volcano + plugins: + env: [] + svc: [] + policies: + - event: PodEvicted # Restart the job when a pod is evicted. 
+ action: RestartJob + tasks: + - replicas: 1 # One ps pod specified + name: ps + template: # Definition of the ps pod metadata: annotations: # must have these annotations @@ -18,12 +26,28 @@ spec: {{use_gputype}} labels: role: leader - kalavai.lws.name: {{deployment_name}} + kalavai.job.name: {{deployment_name}} spec: runtimeClassName: nvidia containers: - - name: aphrodite-leader + - command: + - sh + - -c + - | + RAY_BACKEND_LOG_LEVEL=error /home/ray/workspace/ray_init.sh leader --ray_cluster_size=$(({{remote_workers}}+1)) --ray_port=6379 --ray_object_store_memory={{shmem_size}}; + sleep 30; + nvidia-smi; + ray status; + /home/ray/workspace/run_model.sh \ + --repo_id={{repo_id}} \ + --model_filename={{model_filename}} \ + --extra='{{extra}}' \ + --tensor_parallel_size={{tensor_parallel_size}} \ + --pipeline_parallel_size={{pipeline_parallel_size}} \ + --local_dir=/home/ray/cache; + sleep 30 image: docker.io/bundenth/ray-aphrodite:v1.0.11 + name: aphrodite env: - name: HF_TOKEN value: {{hf_token}} @@ -31,40 +55,23 @@ spec: value: /home/ray/cache - name: TMPDIR value: /home/ray/cache/tmp - command: - - sh - - -c - - "/home/ray/workspace/ray_init.sh leader --ray_cluster_size={{num_workers}} --ray_object_store_memory={{shmem_size}}; - sleep 30; - nvidia-smi; - ray status; - /home/ray/workspace/run_model.sh \ - --repo_id={{repo_id}} \ - --model_filename={{model_filename}} \ - --extra='{{extra}}' \ - --tensor_parallel_size={{tensor_parallel_size}} \ - --pipeline_parallel_size={{pipeline_parallel_size}} \ - --local_dir=/home/ray/cache; - sleep 30" - resources: - requests: - cpu: "{{cpus}}" - memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" - nvidia.com/gpucores: 100 - limits: - cpu: "{{cpus}}" - memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" - nvidia.com/gpucores: 100 ports: - # if use 8080 as exposed port (if required) - containerPort: 8080 + name: model-port readinessProbe: tcpSocket: port: 8080 initialDelaySeconds: 90 periodSeconds: 30 + resources: + requests: + cpu: {{cpus}} + memory: {{memory}}Gi + nvidia.com/gpu: {{gpus}} + limits: + cpu: {{cpus}} + memory: {{memory}}Gi + nvidia.com/gpu: {{gpus}} volumeMounts: - mountPath: /dev/shm name: dshm @@ -77,17 +84,32 @@ spec: sizeLimit: {{shmem_size}} - name: cache persistentVolumeClaim: - claimName: {{storage}} - workerTemplate: + claimName: pool-cache + restartPolicy: Never + - replicas: {{remote_workers}} + name: worker + policies: + - event: TaskCompleted # The job will be marked as completed when two worker pods finish tasks. 
+ action: CompleteJob + template: # Definition of worker pods metadata: annotations: # must have these annotations {{nouse_gputype}} {{use_gputype}} + labels: + kalavai.job.name: {{deployment_name}} spec: runtimeClassName: nvidia containers: - - name: aphrodite-worker + - command: + - sh + - -c + - | + PS_HOST=`head /etc/volcano/ps.host`; + WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&/g' | sed 's/^/"/;s/$/"/' | tr "\n" ","`; + nvidia-smi; + RAY_BACKEND_LOG_LEVEL=error /home/ray/workspace/ray_init.sh worker --ray_address=$PS_HOST --ray_port=6379 --ray_object_store_memory={{shmem_size}} --ray_block=1 image: docker.io/bundenth/ray-aphrodite:v1.0.11 env: - name: HF_TOKEN @@ -96,22 +118,19 @@ spec: value: /home/ray/cache - name: TMPDIR value: /home/ray/cache/tmp - command: - - sh - - -c - - "nvidia-smi; - /home/ray/workspace/ray_init.sh worker --ray_address=$LWS_LEADER_ADDRESS --ray_object_store_memory={{shmem_size}} --ray_block=1" + name: aphrodite + # ports: + # - containerPort: 2222 + # name: ray-port resources: requests: - cpu: "{{cpus}}" + cpu: {{cpus}} memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" - nvidia.com/gpucores: 100 + nvidia.com/gpu: {{gpus}} limits: - cpu: "{{cpus}}" + cpu: {{cpus}} memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" - nvidia.com/gpucores: 100 + nvidia.com/gpu: {{gpus}} volumeMounts: - mountPath: /dev/shm name: dshm @@ -119,4 +138,5 @@ spec: - name: dshm emptyDir: medium: Memory - sizeLimit: {{shmem_size}} \ No newline at end of file + sizeLimit: {{shmem_size}} + restartPolicy: Never diff --git a/templates/aphrodite/values.yaml b/templates/aphrodite/values.yaml index 6c29c6e..fe83e1a 100644 --- a/templates/aphrodite/values.yaml +++ b/templates/aphrodite/values.yaml @@ -8,15 +8,15 @@ default: "pool-cache" description: "Pool storage to use to cache model weights" -- name: replicas - value: "1" - default: "1" - description: "How many replicas to deploy for the model" +- name: queue_name + value: "default" + default: "default" + description: "Name of the kalavai queue to use" -- name: num_workers - value: "1" - default: "1" - description: "Workers per deployment (for tensor parallelism)" +- name: remote_workers + value: "0" + default: "0" + description: "Number of remote workers (for tensor and pipeline parallelism). This is in addition to the main node" - name: repo_id value: null diff --git a/templates/dummy/template.yaml b/templates/dummy/template.yaml index 649ff4a..3cb8186 100644 --- a/templates/dummy/template.yaml +++ b/templates/dummy/template.yaml @@ -1,18 +1,24 @@ -# Specs and examples: https://github.com/kubernetes-sigs/lws/blob/main/docs/examples/sample/README.md -apiVersion: leaderworkerset.x-k8s.io/v1 -kind: LeaderWorkerSet +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job metadata: name: {{deployment_name}} labels: # must have this label - kalavai.lws.name: {{deployment_name}} + kalavai.job.name: {{deployment_name}} spec: - # number of copies for this deployment - replicas: 1 - leaderWorkerTemplate: - # how many workers (distributed nodes) - size: 2 - workerTemplate: + queue: default + #minAvailable: 2 + schedulerName: volcano + plugins: + env: [] + svc: [] + policies: + - event: PodEvicted # Restart the job when a pod is evicted. 
+ action: RestartJob + tasks: + - replicas: 1 # One ps pod specified + name: ps + template: # Definition of the ps pod spec: containers: - name: nginx @@ -30,4 +36,5 @@ spec: nvidia.com/gpu: "1" ports: # what port to make available - - containerPort: 8080 \ No newline at end of file + - containerPort: 8080 + restartPolicy: Never \ No newline at end of file diff --git a/templates/vllm/template.yaml b/templates/vllm/template.yaml index a612493..08d8334 100644 --- a/templates/vllm/template.yaml +++ b/templates/vllm/template.yaml @@ -1,16 +1,23 @@ -apiVersion: leaderworkerset.x-k8s.io/v1 -kind: LeaderWorkerSet +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job metadata: name: {{deployment_name}} labels: # must have this label - kalavai.lws.name: {{deployment_name}} + kalavai.job.name: {{deployment_name}} spec: - replicas: {{replicas}} - leaderWorkerTemplate: - size: {{num_workers}} - restartPolicy: RecreateGroupOnPodRestart - leaderTemplate: + queue: default + schedulerName: volcano + plugins: + env: [] + svc: [] + policies: + - event: PodEvicted # Restart the job when a pod is evicted. + action: RestartJob + tasks: + - replicas: 1 # One ps pod specified + name: ps + template: # Definition of the ps pod metadata: annotations: # must have these annotations @@ -18,11 +25,25 @@ spec: {{use_gputype}} labels: role: leader - kalavai.lws.name: {{deployment_name}} + kalavai.job.name: {{deployment_name}} spec: runtimeClassName: nvidia containers: - - name: vllm-leader + - command: + - sh + - -c + - | + RAY_BACKEND_LOG_LEVEL=error /home/ray/workspace/ray_init.sh leader --ray_cluster_size=$(({{remote_workers}}+1)) --ray_object_store_memory={{shmem_size}}; + sleep 30; + nvidia-smi; + ray status; + /home/ray/workspace/run_model.sh \ + --model_id={{model_id}} \ + --extra='{{extra}}' \ + --tensor_parallel_size={{tensor_parallel_size}} \ + --pipeline_parallel_size={{pipeline_parallel_size}}; + sleep 30 + name: vllm-leader image: docker.io/bundenth/ray-vllm:v1.1.4 env: - name: HF_TOKEN @@ -31,36 +52,23 @@ spec: value: /home/ray/cache - name: TMPDIR value: /home/ray/cache/tmp - command: - - sh - - -c - - "/home/ray/workspace/ray_init.sh leader --ray_cluster_size={{num_workers}} --ray_object_store_memory={{shmem_size}}; - sleep 30; - nvidia-smi; - ray status; - /home/ray/workspace/run_model.sh \ - --model_id={{model_id}} \ - --extra='{{extra}}' \ - --tensor_parallel_size={{tensor_parallel_size}} \ - --pipeline_parallel_size={{pipeline_parallel_size}}; - sleep 30" - resources: - requests: - cpu: "{{cpus}}" - memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" - limits: - cpu: "{{cpus}}" - memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" ports: - # if use 8080 as exposed port (if required) - containerPort: 8080 + name: model-port readinessProbe: tcpSocket: port: 8080 initialDelaySeconds: 90 periodSeconds: 30 + resources: + requests: + cpu: {{cpus}} + memory: {{memory}}Gi + nvidia.com/gpu: {{gpus}} + limits: + cpu: {{cpus}} + memory: {{memory}}Gi + nvidia.com/gpu: {{gpus}} volumeMounts: - mountPath: /dev/shm name: dshm @@ -73,18 +81,34 @@ spec: sizeLimit: {{shmem_size}} - name: cache persistentVolumeClaim: - claimName: {{storage}} - workerTemplate: + claimName: pool-cache + restartPolicy: Never + - replicas: {{remote_workers}} + name: worker + policies: + - event: TaskCompleted # The job will be marked as completed when two worker pods finish tasks. 
+ action: CompleteJob + template: # Definition of worker pods metadata: annotations: # must have these annotations {{nouse_gputype}} {{use_gputype}} + labels: + kalavai.job.name: {{deployment_name}} spec: runtimeClassName: nvidia containers: - name: vllm-worker image: docker.io/bundenth/ray-vllm:v1.1.4 + command: + - sh + - -c + - | + PS_HOST=`head /etc/volcano/ps.host`; + WORKER_HOST=`cat /etc/volcano/worker.host | sed 's/$/&/g' | sed 's/^/"/;s/$/"/' | tr "\n" ","`; + nvidia-smi; + RAY_BACKEND_LOG_LEVEL=error /home/ray/workspace/ray_init.sh worker --ray_address=$PS_HOST --ray_port=6379 --ray_object_store_memory={{shmem_size}} --ray_block=1 env: - name: HF_TOKEN value: {{hf_token}} @@ -92,20 +116,15 @@ spec: value: /home/ray/cache - name: TMPDIR value: /home/ray/cache/tmp - command: - - sh - - -c - - "nvidia-smi; - RAY_BACKEND_LOG_LEVEL=error /home/ray/workspace/ray_init.sh worker --ray_address=$LWS_LEADER_ADDRESS --ray_object_store_memory={{shmem_size}} --ray_block=1" resources: requests: - cpu: "{{cpus}}" + cpu: {{cpus}} memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" + nvidia.com/gpu: {{gpus}} limits: - cpu: "{{cpus}}" + cpu: {{cpus}} memory: {{memory}}Gi - nvidia.com/gpu: "{{gpus}}" + nvidia.com/gpu: {{gpus}} volumeMounts: - mountPath: /dev/shm name: dshm @@ -113,4 +132,5 @@ spec: - name: dshm emptyDir: medium: Memory - sizeLimit: {{shmem_size}} \ No newline at end of file + sizeLimit: {{shmem_size}} + restartPolicy: Never diff --git a/templates/vllm/values.yaml b/templates/vllm/values.yaml index 4e45e37..dbd1bd4 100644 --- a/templates/vllm/values.yaml +++ b/templates/vllm/values.yaml @@ -8,15 +8,10 @@ default: "pool-cache" description: "Pool storage to use to cache model weights" -- name: replicas - value: "1" - default: "1" - description: "How many replicas to deploy for the model" - -- name: num_workers - value: "1" - default: "1" - description: "Workers per deployment (for tensor parallelism)" +- name: remote_workers + value: "0" + default: "0" + description: "Number of remote workers (for tensor and pipeline parallelism). This is in addition to the main node" - name: model_id value: null @@ -38,14 +33,9 @@ default: "1" description: "GPUs per single worker (final one = gpus * num_workers)" -- name: gpu_vram - value: "4000" - default: "4000" - description: "vRAM per GPU (total one = num_workers * gpus * gpu_vram)" - - name: memory - value: "4Gi" - default: "4Gi" + value: "4" + default: "4" description: "RAM memory per single worker (final one = memory * num_workers)" - name: tensor_parallel_size
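
For reviewers, a minimal sketch of how the reworked template flow is meant to be used. It assumes Jinja2 (which the `{{...}}` placeholders and the `.render()` call in `populate_template` imply); `render_job` and the example values are illustrative helpers, not part of this change:

    import yaml
    from jinja2 import Template

    def render_job(template_path, values, defaults_path, force_defaults=True):
        """Mirror of load_template: merge defaults from a values.yaml, then render."""
        with open(template_path) as f:
            template_str = f.read()
        with open(defaults_path) as f:
            defaults = yaml.safe_load(f)  # list of {name, value, default, description}
        for entry in defaults:
            # with force_defaults=True, caller-supplied values win and defaults only
            # fill the gaps; with False, defaults overwrite whatever was passed in
            if not force_defaults or entry["name"] not in values:
                values[entry["name"]] = entry["default"]
        return Template(template_str).render(values)

    rendered = render_job(
        "templates/vllm/template.yaml",
        values={"deployment_name": "qwen-test", "remote_workers": "2"},
        defaults_path="templates/vllm/values.yaml",
    )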
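
Continuing the sketch above, a quick sanity check that a rendered manifest carries the pieces the CLI now keys on: the batch.volcano.sh/v1alpha1 Job kind and the kalavai.job.name label that job__list, job__logs and job__manifest filter by:

    manifest = yaml.safe_load(rendered)
    assert manifest["apiVersion"] == "batch.volcano.sh/v1alpha1"
    assert manifest["kind"] == "Job"
    assert manifest["metadata"]["labels"]["kalavai.job.name"] == "qwen-test"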
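
The CLI itself only ships the new CRD coordinates to the watcher, but for context this is roughly the equivalent direct query against the cluster after the LeaderWorkerSet to Volcano migration; the namespace and kubeconfig handling here are assumptions, not part of this diff:

    from kubernetes import client, config

    def list_kalavai_jobs(namespace="default"):
        config.load_kube_config()  # inside the pool this would be load_incluster_config()
        api = client.CustomObjectsApi()
        jobs = api.list_namespaced_custom_object(
            group="batch.volcano.sh",
            version="v1alpha1",
            namespace=namespace,
            plural="jobs",
            label_selector="kalavai.job.name",  # every kalavai-deployed job carries this label
        )
        # job__list now takes the deployment name from the label rather than metadata.name
        return [item["metadata"]["labels"]["kalavai.job.name"] for item in jobs["items"]]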