From eaed16c421c8fefcf9afeade62d1544b96ef5126 Mon Sep 17 00:00:00 2001
From: Carlos Fernandez Musoles
Date: Sat, 23 Nov 2024 22:56:57 +0000
Subject: [PATCH] templatised pool config

---
Review notes (not part of the commit message; `git am` ignores the text
between the `---` marker and the diffstat):

* Amended relative to the original commit: the string-valued placeholders
  kalavai_api_endpoint, prometheus_endpoint, opencost_endpoint and
  sc_label_selector are now double-quoted like their neighbouring values,
  and the commented-out is_public_pool entry no longer carries the
  description copied from helios_harvest_interval. Line counts are
  unchanged; the `index` blob ids are still those of the original commit.
* Placeholders that feed booleans, numbers or ports (is_public_pool,
  helios_harvest_interval, watcher_port, longhorn_ui_port,
  longhorn_manager_port) are left unquoted on purpose so that their
  rendered type does not change.
* NOTE(review): the Longhorn UI/manager NodePort settings are switched on
  by this change although their own comment says "enable for testing
  only", and both port defaults are empty strings -- confirm this is
  intended.
* NOTE(review): deployment.sleep_interval changes from a literal 600 to
  helios_harvest_interval (default 60, described as minutes) -- confirm
  the unit the chart expects.
* Whitespace was collapsed in the reviewed copy; indentation was restored
  by convention (2 spaces, indented sequences) and must be checked against
  the target tree before applying. The removed/re-added `installed: true`
  pair in the last apps.yaml hunk presumably differs only in whitespace.

 assets/apps.yaml                 | 97 ++++++++++++++++++++------------
 assets/apps_values.yaml          | 30 ++++++++++
 assets/pool_config_template.yaml | 27 +++++++++
 assets/pool_config_values.yaml   | 26 +++++++++
 4 files changed, 145 insertions(+), 35 deletions(-)
 create mode 100644 assets/apps_values.yaml
 create mode 100644 assets/pool_config_template.yaml
 create mode 100644 assets/pool_config_values.yaml

diff --git a/assets/apps.yaml b/assets/apps.yaml
index ce69f7f..2a296db 100644
--- a/assets/apps.yaml
+++ b/assets/apps.yaml
@@ -10,8 +10,47 @@ repositories:
   #   url: https://project-hami.github.io/HAMi/
   - name: longhorn
     url: https://charts.longhorn.io
+  - name: volcano-sh
+    url: https://volcano-sh.github.io/helm-charts
+  - name: prometheus
+    url: https://prometheus-community.github.io/helm-charts
+  - name: opencost-charts
+    url: https://opencost.github.io/opencost-helm-chart
 
 releases:
+  - name: helios
+    namespace: kalavai
+    chart: kalavai/kalavai-helios
+    version: "0.1.8"
+    installed: {{is_public_pool}}
+    set:
+      - name: deployment.watcher_endpoint
+        value: "http://{{watcher_service}}"
+      - name: deployment.watcher_auth_key
+        value: "{{watcher_readonly_key}}"
+      - name: deployment.kalavai_api_endpoint
+        value: "{{kalavai_api_endpoint}}"
+      - name: deployment.user_node_label
+        value: "{{user_node_label}}"
+      - name: deployment.sleep_interval
+        value: {{helios_harvest_interval}}
+  - name: opencost
+    namespace: opencost
+    chart: opencost-charts/opencost
+    installed: {{is_public_pool}}
+  - name: prometheus
+    namespace: prometheus-system
+    chart: prometheus/prometheus
+    installed: {{is_public_pool}}
+    set:
+      - name: prometheus-pushgateway.enabled
+        value: false
+      - name: alertmanager.enabled
+        value: false
+  - name: volcano-sh
+    namespace: kalavai
+    chart: volcano-sh/volcano
+    installed: true
   - name: kuberay
     namespace: kuberay
     chart: kuberay/kuberay-operator
@@ -25,24 +64,24 @@ releases:
     namespace: kalavai
     chart: longhorn/longhorn
     installed: true
-    # set:
-    #   # security issue! enable for testing only
-    #   - name: service.ui.type
-    #     value: NodePort
-    #   - name: service.ui.nodePort
-    #     value: $longhorn_ui_port
-    #   - name: service.manager.type
-    #     value: NodePort
-    #   - name: service.manager.nodePort
-    #     value: $longhorn_manager_port
+    set:
+      # security issue! enable for testing only
+      - name: service.ui.type
+        value: NodePort
+      - name: service.ui.nodePort
+        value: {{longhorn_ui_port}}
+      - name: service.manager.type
+        value: NodePort
+      - name: service.manager.nodePort
+        value: {{longhorn_manager_port}}
   - name: lws
     namespace: kalavai
     chart: kalavai/lws
-    installed: true
+    installed: false
   - name: kalavai-watcher
     namespace: kalavai
     chart: kalavai/kalavai-watcher
-    version: "0.2.8"
+    version: "0.2.15"
     installed: true
     set:
       - name: namespace
@@ -54,37 +93,25 @@ releases:
       - name: deployment.use_auth_key
         value: "True"
       - name: deployment.admin_key
-        value: "$watcher_admin_key"
+        value: "{{watcher_admin_key}}"
       - name: deployment.write_key
-        value: "$watcher_write_key"
+        value: "{{watcher_write_key}}"
       - name: deployment.readonly_key
-        value: "$watcher_readonly_key"
+        value: "{{watcher_readonly_key}}"
       - name: deployment.is_public_pool
-        value: $is_public_pool
+        value: {{is_public_pool}}
       - name: deployment.kalavai_api_endpoint
-        value: $kalavai_api_endpoint
+        value: "{{kalavai_api_endpoint}}"
+      - name: deployment.prometheus_endpoint
+        value: "{{prometheus_endpoint}}"
+      - name: deployment.opencost_endpoint
+        value: "{{opencost_endpoint}}"
       - name: service.nodePort
-        value: $watcher_port
+        value: {{watcher_port}}
   - name: nvidia-gpu-operator
     namespace: kalavai
     chart: kalavai/gpu
-    installed: true
-  - name: helios
-    namespace: kalavai
-    chart: kalavai/kalavai-helios
-    installed: $deploy_helios
-    set:
-      - name: deployment.watcher_endpoint
-        value: "http://$watcher_service"
-      - name: deployment.watcher_auth_key
-        value: "$watcher_readonly_key"
-      - name: deployment.kalavai_api_endpoint
-        value: "https://platform.kalavai.net/_/api"
-      - name: deployment.user_node_label
-        value: "$user_node_label"
-      - name: deployment.sleep_interval
-        value: 600
-
+    installed: true
   - name: hami-vgpu
     namespace: kalavai
     chart: kalavai/hami
diff --git a/assets/apps_values.yaml b/assets/apps_values.yaml
new file mode 100644
index 0000000..82028fd
--- /dev/null
+++ b/assets/apps_values.yaml
@@ -0,0 +1,30 @@
+# TODO: add helm versions here
+
+- name: kalavai_api_endpoint
+  default: https://platform.kalavai.net/_/api
+  description: ""
+
+- name: prometheus_endpoint
+  default: http://prometheus-server.prometheus-system.svc.cluster.local:80
+  description: ""
+
+- name: opencost_endpoint
+  default: http://opencost.opencost.svc.cluster.local:9003
+  description: ""
+
+- name: helios_harvest_interval
+  default: 60
+  description: "Interval (minutes) at which to report resource usage in public pools"
+
+- name: longhorn_ui_port
+  default: ""  # 31010
+  description: ""
+
+- name: longhorn_manager_port
+  default: ""  # 31011
+  description: ""
+
+# - name: is_public_pool
+#   default: false
+#   description: ""
+
diff --git a/assets/pool_config_template.yaml b/assets/pool_config_template.yaml
new file mode 100644
index 0000000..f946e3e
--- /dev/null
+++ b/assets/pool_config_template.yaml
@@ -0,0 +1,27 @@
+kind: StorageClass
+apiVersion: storage.k8s.io/v1
+metadata:
+  name: longhorn-rwx
+provisioner: driver.longhorn.io
+allowVolumeExpansion: true
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+parameters:
+  shareManagerNodeSelector: "{{sc_label_selector}}"
+  numberOfReplicas: "{{sc_replicas}}"
+  staleReplicaTimeout: "2880"
+  fromBackup: ""
+  fsType: "ext4"
+# ---
+# apiVersion: scheduling.volcano.sh/v1beta1
+# kind: Queue
+# metadata:
+#   name: default-queue
+# spec:
+#   weight: 1
+#   reclaimable: {{queue_reclaimable}}
+#   capability:
+#     cpu: {{queue_max_cpus}}
+#     memory: {{queue_max_memory}}Gi
+#     nvidia.com/gpu: {{queue_max_gpus}}
+
diff --git a/assets/pool_config_values.yaml b/assets/pool_config_values.yaml
new file mode 100644
index 0000000..f102087
--- /dev/null
+++ b/assets/pool_config_values.yaml
@@ -0,0 +1,26 @@
+# STORAGE #
+- name: sc_label_selector
+  default: "kalavai.storage.enabled:True"
+  description: ""
+
+- name: sc_replicas
+  default: 3
+  description: ""
+
+# QUEUE #
+# - name: queue_reclaimable
+#   default: "true"
+#   description: ""
+
+# - name: queue_max_cpus
+#   default: 1000000
+#   description: ""
+
+# - name: queue_max_gpus
+#   default: 1000000
+#   description: ""
+
+# - name: queue_max_memory
+#   default: 1000000
+#   description: ""
+