Skip to content

Commit

Permalink
templatised pool config
Browse files Browse the repository at this point in the history
  • Loading branch information
musoles committed Nov 23, 2024
1 parent 5b5a82d commit eaed16c
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 35 deletions.
97 changes: 62 additions & 35 deletions assets/apps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,47 @@ repositories:
# url: https://project-hami.github.io/HAMi/
- name: longhorn
url: https://charts.longhorn.io
- name: volcano-sh
url: https://volcano-sh.github.io/helm-charts
- name: prometheus
url: https://prometheus-community.github.io/helm-charts
- name: opencost-charts
url: https://opencost.github.io/opencost-helm-chart

releases:
- name: helios
namespace: kalavai
chart: kalavai/kalavai-helios
version: "0.1.8"
installed: {{is_public_pool}}
set:
- name: deployment.watcher_endpoint
value: "http://{{watcher_service}}"
- name: deployment.watcher_auth_key
value: "{{watcher_readonly_key}}"
- name: deployment.kalavai_api_endpoint
value: {{kalavai_api_endpoint}}
- name: deployment.user_node_label
value: "{{user_node_label}}"
- name: deployment.sleep_interval
value: {{helios_harvest_interval}}
- name: opencost
namespace: opencost
chart: opencost-charts/opencost
installed: {{is_public_pool}}
- name: prometheus
namespace: prometheus-system
chart: prometheus/prometheus
installed: {{is_public_pool}}
set:
- name: prometheus-pushgateway.enabled
value: false
- name: alertmanager.enabled
value: false
- name: volcano-sh
namespace: kalavai
chart: volcano-sh/volcano
installed: true
- name: kuberay
namespace: kuberay
chart: kuberay/kuberay-operator
Expand All @@ -25,24 +64,24 @@ releases:
namespace: kalavai
chart: longhorn/longhorn
installed: true
# set:
# # security issue! enable for testing only
# - name: service.ui.type
# value: NodePort
# - name: service.ui.nodePort
# value: $longhorn_ui_port
# - name: service.manager.type
# value: NodePort
# - name: service.manager.nodePort
# value: $longhorn_manager_port
set:
# Security risk: exposes the Longhorn UI and manager services via NodePort; enable for testing only
- name: service.ui.type
value: NodePort
- name: service.ui.nodePort
value: {{longhorn_ui_port}}
- name: service.manager.type
value: NodePort
- name: service.manager.nodePort
value: {{longhorn_manager_port}}
- name: lws
namespace: kalavai
chart: kalavai/lws
installed: true
installed: false
- name: kalavai-watcher
namespace: kalavai
chart: kalavai/kalavai-watcher
version: "0.2.8"
version: "0.2.15"
installed: true
set:
- name: namespace
Expand All @@ -54,37 +93,25 @@ releases:
- name: deployment.use_auth_key
value: "True"
- name: deployment.admin_key
value: "$watcher_admin_key"
value: "{{watcher_admin_key}}"
- name: deployment.write_key
value: "$watcher_write_key"
value: "{{watcher_write_key}}"
- name: deployment.readonly_key
value: "$watcher_readonly_key"
value: "{{watcher_readonly_key}}"
- name: deployment.is_public_pool
value: $is_public_pool
value: {{is_public_pool}}
- name: deployment.kalavai_api_endpoint
value: $kalavai_api_endpoint
value: {{kalavai_api_endpoint}}
- name: deployment.prometheus_endpoint
value: {{prometheus_endpoint}}
- name: deployment.opencost_endpoint
value: {{opencost_endpoint}}
- name: service.nodePort
value: $watcher_port
value: {{watcher_port}}
- name: nvidia-gpu-operator
namespace: kalavai
chart: kalavai/gpu
installed: true
- name: helios
namespace: kalavai
chart: kalavai/kalavai-helios
installed: $deploy_helios
set:
- name: deployment.watcher_endpoint
value: "http://$watcher_service"
- name: deployment.watcher_auth_key
value: "$watcher_readonly_key"
- name: deployment.kalavai_api_endpoint
value: "https://platform.kalavai.net/_/api"
- name: deployment.user_node_label
value: "$user_node_label"
- name: deployment.sleep_interval
value: 600

installed: true
- name: hami-vgpu
namespace: kalavai
chart: kalavai/hami
Expand Down
30 changes: 30 additions & 0 deletions assets/apps_values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Values for the {{...}} placeholders used in assets/apps.yaml.
# Each entry lists the placeholder name, its default, and a description.
# TODO: add helm versions here

# Used for deployment.kalavai_api_endpoint in the helios and kalavai-watcher
# releases.
- name: kalavai_api_endpoint
  default: https://platform.kalavai.net/_/api
  description: ""

# Used for deployment.prometheus_endpoint in the kalavai-watcher release.
- name: prometheus_endpoint
  default: http://prometheus-server.prometheus-system.svc.cluster.local:80
  description: ""

# Used for deployment.opencost_endpoint in the kalavai-watcher release.
- name: opencost_endpoint
  default: http://opencost.opencost.svc.cluster.local:9003
  description: ""

# Used for deployment.sleep_interval in the helios release.
# NOTE(review): the hard-coded sleep_interval this replaces was 600; confirm
# the unit helios expects matches the description below.
- name: helios_harvest_interval
  default: 60
  description: "Interval (minutes) at which to report resource usage in public pools"

# Used for service.ui.nodePort in the longhorn release.
- name: longhorn_ui_port
  default: ""  # 31010
  description: ""

# Used for service.manager.nodePort in the longhorn release.
- name: longhorn_manager_port
  default: ""  # 31011
  description: ""

# - name: is_public_pool
#   default: false
#   description: "Whether the pool is public (gates the helios, opencost and prometheus releases)"

27 changes: 27 additions & 0 deletions assets/pool_config_template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# StorageClass template for the Longhorn provisioner. The {{...}} placeholders
# have their defaults listed in assets/pool_config_values.yaml.
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: longhorn-rwx
provisioner: driver.longhorn.io
allowVolumeExpansion: true
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
parameters:
  # Templated values are quoted so the rendered result is always a YAML
  # string, even if the substituted text is empty or looks like another type.
  shareManagerNodeSelector: "{{sc_label_selector}}"
  numberOfReplicas: "{{sc_replicas}}"
  staleReplicaTimeout: "2880"
  fromBackup: ""
  fsType: "ext4"
# ---
# apiVersion: scheduling.volcano.sh/v1beta1
# kind: Queue
# metadata:
# name: default-queue
# spec:
# weight: 1
# reclaimable: {{queue_reclaimable}}
# capability:
# cpu: {{queue_max_cpus}}
# memory: {{queue_max_memory}}Gi
# nvidia.com/gpu: {{queue_max_gpus}}

26 changes: 26 additions & 0 deletions assets/pool_config_values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# STORAGE #
# Values for the {{...}} placeholders in assets/pool_config_template.yaml.

# Substituted into parameters.shareManagerNodeSelector of the longhorn-rwx
# StorageClass.
- name: sc_label_selector
  default: "kalavai.storage.enabled:True"
  description: ""

# Substituted into parameters.numberOfReplicas of the longhorn-rwx
# StorageClass (the template wraps it in quotes).
- name: sc_replicas
  default: 3
  description: ""

# QUEUE #
# - name: queue_reclaimable
# default: "true"
# description: ""

# - name: queue_max_cpus
# default: 1000000
# description: ""

# - name: queue_max_gpus
# default: 1000000
# description: ""

# - name: queue_max_memory
# default: 1000000
# description: ""

0 comments on commit eaed16c

Please sign in to comment.