diff --git a/docs/source/user_guide/idle_culler.md b/docs/source/user_guide/idle_culler.md new file mode 100644 index 0000000000..350cb8ddff --- /dev/null +++ b/docs/source/user_guide/idle_culler.md @@ -0,0 +1,26 @@ +# Culling idle notebook servers + +Qhub uses a mix of the [jupyterhub-idle-culler](https://github.com/jupyterhub/jupyterhub-idle-culler) extension and internal JupyterLab server configuration to periodically check for idle notebook servers and shut them down. + +JupyterHub pings the user's notebook server at certain time intervals. If no response is received from the +server during these checks and the timeout expires, the server is considered to be *inactive (idle)* and will +be culled. + +To help jupyterhub-idle-culler cull user servers, we configure the kernel manager to cull idle kernels that +would otherwise make the user servers report themselves as active, which is part of what jupyterhub-idle-culler considers. + +*** +The expected behavior is that the server will be shut down and removed from the Qhub namespace once all Terminals and Kernels are considered idle or terminated, and any remaining connections are closed. +*** + +## Default settings + +By default, JupyterHub will ping the user notebook servers every 5 minutes to check their status. Every server found to be idle for more than 30 minutes will be terminated. + +Because the servers don't have a maximum age set, an active (has any open connection, terminal, or kernel in execution) server will not be shut down +regardless of how long it has been up and running. + +The process for culling and terminating follows these steps: +- Check if the Terminal and Notebook kernels are idle for more than 15 minutes, with periodic culling checks every 5 minutes. +- If the kernel is idle for more than 15 minutes, terminate the kernel and the server. +- Once no connections remain, after another 15 minutes of no API calls from the user pod, the server is considered idle and will be terminated.
diff --git a/qhub/template/stages/07-kubernetes-services/jupyterhub.tf b/qhub/template/stages/07-kubernetes-services/jupyterhub.tf index d9ecdad051..4defc42dbd 100644 --- a/qhub/template/stages/07-kubernetes-services/jupyterhub.tf +++ b/qhub/template/stages/07-kubernetes-services/jupyterhub.tf @@ -107,7 +107,7 @@ module "jupyterhub" { name = "dask-etc" namespace = var.environment kind = "configmap" - } + }, } services = concat([ diff --git a/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/files/04-idle-culler.py b/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/files/04-idle-culler.py new file mode 100644 index 0000000000..96e551a231 --- /dev/null +++ b/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/files/04-idle-culler.py @@ -0,0 +1,38 @@ +# To help jupyterhub-idle-culler cull user servers, we configure the kernel manager to cull +# idle kernels that would otherwise make the user servers report themselves as active which +# is part of what jupyterhub-idle-culler considers. + +# Extra config available at: +# https://zero-to-jupyterhub.readthedocs.io/en/1.x/jupyterhub/customizing/user-management.html#culling-user-pods + +# Timeout (in seconds) in which a terminal has been inactive and ready to +# be culled. +c.TerminalManager.cull_inactive_timeout = 15 * 60 + +# The interval (in seconds) on which to check for terminals exceeding the +# inactive timeout value. 
+c.TerminalManager.cull_interval = 5 * 60 + +# cull_idle_timeout: timeout (in seconds) after which an idle kernel is +# considered ready to be culled +c.MappingKernelManager.cull_idle_timeout = 15 * 60 + +# cull_interval: the interval (in seconds) on which to check for idle +# kernels exceeding the cull timeout value +c.MappingKernelManager.cull_interval = 5 * 60 + +# cull_connected: whether to consider culling kernels which have one +# or more connections +c.MappingKernelManager.cull_connected = True + +# cull_busy: whether to consider culling kernels which are currently +# busy running some code +c.MappingKernelManager.cull_busy = False + +# Shut down the server after N seconds with no kernels or terminals +# running and no activity. +c.NotebookApp.shutdown_no_activity_timeout = 15 * 60 + +############################################################################### +# JupyterHub idle culler total timeout corresponds (approximately) to: +# max(cull_idle_timeout, cull_inactive_timeout) + shutdown_no_activity_timeout diff --git a/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/main.tf b/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/main.tf index e705941331..2b7a6d5923 100644 --- a/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/main.tf +++ b/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/main.tf @@ -10,6 +10,17 @@ resource "random_password" "proxy_secret_token" { special = false } +resource "kubernetes_config_map" "server-idle-culling" { + metadata { + name = "server-idle-culling" + namespace = var.namespace + } + + data = { + "jupyter_notebook_config.py" = file("${path.module}/files/04-idle-culler.py") + } +} + resource "helm_release" "jupyterhub" { name = "jupyterhub" namespace = var.namespace @@ -30,7 +41,16 @@ resource "helm_release" "jupyterhub" { shared-pvc = var.shared-pvc conda-store-pvc = var.conda-store-pvc 
conda-store-mount = var.conda-store-mount - extra-mounts = var.extra-mounts + extra-mounts = merge( + var.extra-mounts, + { + "/etc/jupyter" = { + name = "server-idle-culling" + namespace = var.namespace + kind = "configmap" + } + } + ) environments = var.conda-store-environments } @@ -41,9 +61,9 @@ resource "helm_release" "jupyterhub" { } extraConfig = { - "01-theme.py" = file("${path.module}/files/01-theme.py") - "02-spawner.py" = file("${path.module}/files/02-spawner.py") - "03-profiles.py" = file("${path.module}/files/03-profiles.py") + "01-theme.py" = file("${path.module}/files/01-theme.py") + "02-spawner.py" = file("${path.module}/files/02-spawner.py") + "03-profiles.py" = file("${path.module}/files/03-profiles.py") } services = { diff --git a/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/values.yaml b/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/values.yaml index 2d4755d21c..9f849698c3 100644 --- a/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/values.yaml +++ b/qhub/template/stages/07-kubernetes-services/modules/kubernetes/services/jupyterhub/values.yaml @@ -43,3 +43,19 @@ singleuser: guarantee: "1G" networkPolicy: enabled: false + +# cull relates to the jupyterhub-idle-culler service, responsible for evicting +# inactive singleuser pods. +# +# The configuration below, except for enabled, corresponds to command-line flags +# for jupyterhub-idle-culler as documented here: +# https://github.com/jupyterhub/jupyterhub-idle-culler#as-a-standalone-script +# +cull: + enabled: true + users: false + removeNamedServers: false + timeout: 1800 + every: 600 + concurrency: 10 + maxAge: 0