Skip to content

Commit

Permalink
Slurm on GKE - Guide (#864)
Browse files Browse the repository at this point in the history
* Slurm on GKE - Guide and code

* Update README.md

* remove empty line

* add new lines at the end of each file

* remove embedded image

* modules moved to the shared modules directory

* module references updated

* Update README.md

* Update README.md

* Update README.md

* Revert "Update README.md"

This reverts commit 76703f0.

* pinned version

* Update providers.tf
  • Loading branch information
danielmarzini authored Nov 18, 2024
1 parent d72c274 commit ee94bdc
Show file tree
Hide file tree
Showing 33 changed files with 2,575 additions and 0 deletions.
50 changes: 50 additions & 0 deletions modules/slurm-cluster/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


locals {
  # Directory scanned for manifest templates: the caller-supplied
  # var.templates_path (with "~" expanded) when it is set, otherwise the
  # manifest-templates directory inside this module.
  wl_templates_path = (
    var.templates_path != null
    ? pathexpand(var.templates_path)
    : "${path.module}/manifest-templates"
  )

  # Full path of every file in that directory whose name starts with a digit
  # and ends in "yml".
  wl_templates = [
    for template_name in fileset(local.wl_templates_path, "[0-9]*yml") :
    "${local.wl_templates_path}/${template_name}"
  ]
}

# Namespace named var.namespace; created only when var.namespace_create is
# true (count 1), otherwise no namespace resource is managed (count 0).
resource "kubernetes_namespace" "default" {
  count = var.namespace_create ? 1 : 0

  metadata {
    name = var.namespace
  }
}

# One Kubernetes manifest per template listed in local.wl_templates. Each
# template is rendered with `namespace` and `cluster_config` as template
# variables and the resulting YAML is decoded into the manifest object.
resource "kubernetes_manifest" "default" {
  for_each = toset(local.wl_templates)
  manifest = yamldecode(templatefile(each.value, {
    namespace      = var.namespace
    cluster_config = var.cluster_config
  }))

  # The templates receive the namespace from var.namespace rather than from
  # the namespace resource, so Terraform sees no implicit dependency. Declare
  # it explicitly so that, when var.namespace_create is true, the namespace
  # exists before any namespaced object is applied. With count = 0 this is a
  # no-op.
  depends_on = [kubernetes_namespace.default]

  timeouts {
    create = "30m"
  }

  # Take ownership of fields even when another field manager already set them.
  field_manager {
    force_conflicts = true
  }
}
123 changes: 123 additions & 0 deletions modules/slurm-cluster/manifest-templates/00-configmap-slurm-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ConfigMap carrying the Slurm configuration files: slurm.conf, the cloud.conf
# it includes, cloud_gres.conf, gres.conf and cgroup.conf.
# `namespace` is a Terraform template variable.
apiVersion: v1
kind: ConfigMap
metadata:
  name: slurm-conf-configmap
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
data:
  slurm.conf: |
    # slurm.conf
    #
    # See the slurm.conf man page for more information.
    #
    ClusterName=linux
    SlurmctldHost=slurmctld-0
    #
    SlurmUser=slurm
    SlurmctldPort=6820-6830
    SlurmdPort=6818
    AuthType=auth/munge
    StateSaveLocation=/var/spool/slurmctld
    SlurmdSpoolDir=/var/spool/slurmd
    SwitchType=switch/none
    MpiDefault=pmix
    SlurmctldPidFile=/var/run/slurmd/slurmctld.pid
    SlurmdPidFile=/var/run/slurmd/slurmd.pid
    ProctrackType=proctrack/linuxproc
    ReturnToService=2
    #
    # TIMERS
    SlurmctldTimeout=300
    SlurmdTimeout=30
    InactiveLimit=0
    MinJobAge=300
    KillWait=30
    Waittime=0
    #
    # SCHEDULING
    SchedulerType=sched/backfill
    SelectType=select/cons_tres
    SelectTypeParameters=CR_CPU_Memory
    #
    # LOGGING
    SlurmctldDebug=3
    SlurmctldLogFile=/var/log/slurm/slurmctld.log
    SlurmdDebug=3
    SlurmdLogFile=/var/log/slurm/slurmd.log
    JobCompType=jobcomp/filetxt
    JobCompLoc=/var/log/slurm/jobcomp.log
    #
    # ACCOUNTING
    JobAcctGatherType=jobacct_gather/linux
    JobAcctGatherFrequency=30
    #
    AccountingStorageType=accounting_storage/slurmdbd
    AccountingStorageHost=slurmdbd
    AccountingStoragePort=6819
    #
    # SlurmctldParameters is set exactly once for the whole configuration.
    # Slurm keeps only the last value of a repeated option, so a second
    # SlurmctldParameters line in the included cloud.conf would drop
    # cloud_reg_addrs instead of adding to it.
    SlurmctldParameters=cloud_reg_addrs,enable_configless
    # CLOUD CONFIGURATIONS
    MaxNodeCount=64000
    include cloud.conf
  cloud.conf: |
    PrivateData=cloud
    # SlurmctldParameters (cloud_reg_addrs,enable_configless) is set in slurm.conf.
    ## GRES
    GresTypes=gpu
    AccountingStorageTRES=gres/gpu
    DebugFlags=Gres
    TreeWidth=128
    # NODES
    NodeName=DEFAULT State=UNKNOWN RealMemory=15000 CPUs=4 CoresPerSocket=2 ThreadsPerCore=2 Gres=gpu:1
    NodeName=slurmd-[0-39] State=CLOUD Gres=gpu:1
    NodeSet=slurmdnodeset Nodes=slurmd-[0-39]
    NodeName=DEFAULT State=UNKNOWN RealMemory=30000 CPUs=8 CoresPerSocket=2 ThreadsPerCore=2 Gres=gpu:2
    NodeName=slurmd1-[0-39] State=CLOUD Gres=gpu:2
    NodeSet=slurmd1nodeset Nodes=slurmd1-[0-39]
    # PARTITIONS
    PartitionName=all Default=yes Nodes=ALL MaxTime=INFINITE State=UP
    PropagateResourceLimitsExcept=MEMLOCK
    PartitionName=1gpunodes Nodes=slurmdnodeset State=UP DefMemPerCPU=7007 SuspendTime=300 Oversubscribe=Exclusive PowerDownOnIdle=YES ResumeTimeout=300 SuspendTimeout=120
    PartitionName=2gpunodes Nodes=slurmd1nodeset State=UP DefMemPerCPU=7007 SuspendTime=300 Oversubscribe=Exclusive PowerDownOnIdle=YES ResumeTimeout=300 SuspendTimeout=120
  cloud_gres.conf: |
    NodeName=slurmd-[0-39] Name=gpu File=/dev/nvidia0
    NodeName=slurmd1-[0-39] Name=gpu File=/dev/nvidia[0-1]
  gres.conf: |
    NodeName=slurmd-[0-39] Name=gpu File=/dev/nvidia0
    NodeName=slurmd1-[0-39] Name=gpu File=/dev/nvidia[0-1]
  cgroup.conf: |
    ###
    #
    # Slurm cgroup support configuration file
    #
    # See man slurm.conf and man cgroup.conf for further
    # information on cgroup configuration parameters
    #--
    ConstrainCores=yes
    ConstrainDevices=yes
    ConstrainRAMSpace=yes
    ConstrainSwapSpace=yes
    IgnoreSystemd=yes
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ConfigMap carrying slurmdbd.conf. `namespace` and `cluster_config` are
# Terraform template variables; the database host and user are substituted
# into the file body.
apiVersion: v1
kind: ConfigMap
metadata:
  name: slurmdbd-conf-configmap
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
data:
  slurmdbd.conf: |
    #
    # Example slurmdbd.conf file.
    #
    # See the slurmdbd.conf man page for more information.
    #
    # Authentication info
    AuthType=auth/munge
    #
    # slurmDBD info
    DbdAddr=slurmdbd
    DbdHost=slurmdbd
    SlurmUser=slurm
    DebugLevel=4
    LogFile=/var/log/slurm/slurmdbd.log
    PidFile=/var/run/slurmdbd/slurmdbd.pid
    #
    # Database info
    StorageType=accounting_storage/mysql
    StorageHost=${cluster_config.database.host}
    StorageUser=${cluster_config.database.user}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Secret holding the database password under the `password` key.
# `namespace` and `cluster_config` are Terraform template variables.
apiVersion: v1
kind: Secret
metadata:
  name: database-auth-secret
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
data:
  # Quoted so an empty or number-looking value stays a string.
  # NOTE(review): the value is inserted verbatim (no base64encode() here).
  # Secret `data` values must be base64, so the caller presumably supplies an
  # already-encoded password - confirm against the module's variable docs.
  password: "${cluster_config.database.password}"
29 changes: 29 additions & 0 deletions modules/slurm-cluster/manifest-templates/00-secret-munge-key.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Secret holding the munge key under the `munge.key` key.
# `namespace` and `cluster_config` are Terraform template variables.
apiVersion: v1
kind: Secret
metadata:
  name: munge-key-secret
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
data:
  # The key is base64-encoded at render time. The result is quoted because a
  # base64 string can consist only of digits (or spell a YAML keyword) and
  # would otherwise be decoded as a non-string value.
  munge.key: "${base64encode(cluster_config.munge.key)}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ReadWriteMany claim named slurm-shared-storage on the standard-rwx storage
# class. The requested size comes from cluster_config.storage.size_gb
# (a Terraform template variable) with a "Gi" suffix appended.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: slurm-shared-storage
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
spec:
  storageClassName: standard-rwx
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: ${cluster_config.storage.size_gb}Gi
36 changes: 36 additions & 0 deletions modules/slurm-cluster/manifest-templates/01-pvc-var-lib-mysql.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ReadWriteOnce claim named var-lib-mysql, labelled as the mysql component of
# slurm. The requested size comes from cluster_config.database.storage_size_gb
# (a Terraform template variable) with a "Gi" suffix appended.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app.kubernetes.io/name: slurm
    app.kubernetes.io/component: mysql
  name: var-lib-mysql
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: ${cluster_config.database.storage_size_gb}Gi
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# MIT License

# Copyright (c) 2019 Giovanni Torres

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# ReadWriteOnce claim named var-spool-slurmctld, labelled as the slurmctld
# component of slurm, with a fixed 100Mi request.
# `namespace` is a Terraform template variable.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app.kubernetes.io/name: slurm
    app.kubernetes.io/component: slurmctld
  name: var-spool-slurmctld
  # Quoted so that a namespace such as "123" or "on" is still read as a string.
  namespace: "${namespace}"
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Mi
Loading

0 comments on commit ee94bdc

Please sign in to comment.