Skip to content

Commit

Permalink
Add Helm chart for kubeflow trainer
Browse files Browse the repository at this point in the history
Signed-off-by: Yi Chen <github@chenyicn.net>
  • Loading branch information
ChenYi015 committed Feb 13, 2025
1 parent 112dc75 commit 325bb3a
Show file tree
Hide file tree
Showing 20 changed files with 24,290 additions and 0 deletions.
62 changes: 62 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,21 @@ endif
SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec

REPO := github.com/kubeflow/trainer
TRAINER_CHART_PATH := charts/trainer

## Location to install binaries
LOCALBIN ?= $(shell pwd)/bin

## Versions
HELM_VERSION ?= v3.15.3
HELM_UNITTEST_VERSION ?= 0.5.1
HELM_DOCS_VERSION ?= v1.14.2

## Binaries
HELM ?= $(LOCALBIN)/helm-$(HELM_VERSION)
HELM_DOCS ?= $(LOCALBIN)/helm-docs-$(HELM_DOCS_VERSION)

##@ General

# The help target prints out all targets with their descriptions organized
Expand Down Expand Up @@ -124,3 +139,50 @@ test-python-integration: ## Run Python integration test.
pip install -r ./cmd/initializer/dataset/requirements.txt

pytest ./test/integration/initializer

##@ Helm

.PHONY: helm-unittest
helm-unittest: helm-unittest-plugin ## Run Helm chart unittests.
$(HELM) unittest $(TRAINER_CHART_PATH) --strict --file "tests/**/*_test.yaml"

.PHONY: helm-lint
helm-lint: ## Run Helm chart lint test.
docker run --rm --workdir /workspace --volume "$$(pwd):/workspace" quay.io/helmpack/chart-testing:latest ct lint --target-branch master --validate-maintainers=false

.PHONY: helm-docs
helm-docs: helm-docs-plugin ## Generates markdown documentation for helm charts from requirements and values files.
$(HELM_DOCS) --sort-values-order=file

##@ Dependencies

.PHONY: helm
helm: $(HELM) ## Download helm locally if necessary.
$(HELM): $(LOCALBIN)
$(call go-install-tool,$(HELM),helm.sh/helm/v3/cmd/helm,$(HELM_VERSION))

.PHONY: helm-unittest-plugin
helm-unittest-plugin: helm ## Download helm unittest plugin locally if necessary.
if [ -z "$(shell $(HELM) plugin list | grep unittest)" ]; then \
echo "Installing helm unittest plugin"; \
$(HELM) plugin install https://github.com/helm-unittest/helm-unittest.git --version $(HELM_UNITTEST_VERSION); \
fi

.PHONY: helm-docs-plugin
helm-docs-plugin: $(HELM_DOCS) ## Download helm-docs plugin locally if necessary.
$(HELM_DOCS): $(LOCALBIN)
$(call go-install-tool,$(HELM_DOCS),github.com/norwoodj/helm-docs/cmd/helm-docs,$(HELM_DOCS_VERSION))

# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
# $1 - target path with name of binary (ideally with version)
# $2 - package url which can be installed
# $3 - specific version of package
define go-install-tool
@[ -f $(1) ] || { \
set -e; \
package=$(2)@$(3) ;\
echo "Downloading $${package}" ;\
GOBIN=$(LOCALBIN) go install $${package} ;\
mv "$$(echo "$(1)" | sed "s/-$(3)$$//")" $(1) ;\
}
endef
23 changes: 23 additions & 0 deletions charts/trainer/.helmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
37 changes: 37 additions & 0 deletions charts/trainer/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#
# Copyright 2024 The Kubeflow authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

apiVersion: v2

name: trainer

description: A Helm chart for deploying Kubeflow trainer on Kubernetes.

version: 2.0.0

appVersion: 2.0.0

type: application

keywords:
- kubeflow trainer

home: https://github.com/kubeflow/trainer

maintainers:
- name: ChenYi015
email: github@chenyicn.net
url: https://github.com/ChenYi015
105 changes: 105 additions & 0 deletions charts/trainer/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# trainer

![Version: 2.0.0](https://img.shields.io/badge/Version-2.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.0.0](https://img.shields.io/badge/AppVersion-2.0.0-informational?style=flat-square)

A Helm chart for deploying Kubeflow trainer on Kubernetes.

**Homepage:** <https://github.com/kubeflow/trainer>

## Introduction

This chart bootstraps a [Kubernetes Trainer](https://github.com/kubeflow/trainer) deployment using the [Helm](https://helm.sh) package manager.

## Prerequisites

- Helm >= 3
- Kubernetes >= 1.20

## Usage

### Add Helm Repo

```bash
helm repo add trainer https://kubeflow.github.io/trainer

helm repo update
```

See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation.

### Install the chart

```bash
helm install [RELEASE_NAME] trainer/trainer
```

For example, if you want to create a release with name `trainer` in the `kubeflow-system` namespace:

```shell
helm install trainer trainer/trainer \
--namespace kubeflow-system \
--create-namespace
```

Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist.

See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation.

### Upgrade the chart

```shell
helm upgrade [RELEASE_NAME] trainer/trainer [flags]
```

See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation.

### Uninstall the chart

```shell
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes resources associated with the chart and deletes the release, except for the `crds`, those will have to be removed manually.

See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation.

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| nameOverride | string | `""` | String to partially override release name. |
| fullnameOverride | string | `""` | String to fully override release name. |
| commonLabels | object | `{}` | Common labels to add to the resources. |
| image.registry | string | `"docker.io"` | Image registry. |
| image.repository | string | `"kubeflow/trainer-controller-controller"` | Image repository. |
| image.tag | string | If not set, the chart appVersion will be used. | Image tag. |
| image.pullPolicy | string | `"IfNotPresent"` | Image pull policy. |
| image.pullSecrets | list | `[]` | Image pull secrets for private image registry. |
| controller.replicas | int | `1` | Number of replicas of controller. |
| controller.labels | object | `{}` | Extra labels for controller pods. |
| controller.annotations | object | `{}` | Extra annotations for controller pods. |
| controller.volumes | list | `[]` | Volumes for controller pods. |
| controller.nodeSelector | object | `{}` | Node selector for controller pods. |
| controller.affinity | object | `{}` | Affinity for controller pods. |
| controller.tolerations | list | `[]` | List of node taints to tolerate for controller pods. |
| controller.env | list | `[]` | Environment variables for controller containers. |
| controller.envFrom | list | `[]` | Environment variable sources for controller containers. |
| controller.volumeMounts | list | `[]` | Volume mounts for controller containers. |
| controller.resources | object | `{}` | Pod resource requests and limits for controller containers. |
| controller.securityContext | object | `{}` | Security context for controller containers. |
| controller.serviceAccount.create | bool | `true` | Specifies whether to create a service account for the controller. |
| controller.serviceAccount.name | string | `""` | Optional name for the controller service account. |
| controller.serviceAccount.annotations | object | `{}` | Extra annotations for the controller service account. |
| controller.serviceAccount.automountServiceAccountToken | bool | `true` | Auto-mount service account token to the controller pods. |
| webhook.enable | bool | `true` | Specifies whether to enable webhook. |
| webhook.failurePolicy | string | `"Fail"` | Specifies how unrecognized errors are handled. Available options are `Ignore` or `Fail`. |
| runtime.preTraining.torchDistributed.enable | bool | `true` | |
| runtime.preTraining.torchDistributed.image.registry | string | `"docker.io"` | |
| runtime.preTraining.torchDistributed.image.repository | string | `"pytorch/pytorch"` | |
| runtime.preTraining.torchDistributed.image.tag | string | `"2.5.0-cuda12.4-cudnn9-runtime"` | |

## Maintainers

| Name | Email | Url |
| ---- | ------ | --- |
| ChenYi015 | <github@chenyicn.net> | <https://github.com/ChenYi015> |
70 changes: 70 additions & 0 deletions charts/trainer/README.md.gotmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{{ template "chart.header" . }}

{{ template "chart.deprecationWarning" . }}

{{ template "chart.badgesSection" . }}

{{ template "chart.description" . }}

{{ template "chart.homepageLine" . }}

## Introduction

This chart bootstraps a [Kubernetes Trainer]({{template "chart.homepage" . }}) deployment using the [Helm](https://helm.sh) package manager.

## Prerequisites

- Helm >= 3
- Kubernetes >= 1.20

## Usage

### Add Helm Repo

```bash
helm repo add trainer https://kubeflow.github.io/trainer

helm repo update
```

See [helm repo](https://helm.sh/docs/helm/helm_repo) for command documentation.

### Install the chart

```bash
helm install [RELEASE_NAME] trainer/trainer
```

For example, if you want to create a release with name `trainer` in the `kubeflow-system` namespace:

```shell
helm install trainer trainer/trainer \
--namespace kubeflow-system \
--create-namespace
```

Note that by passing the `--create-namespace` flag to the `helm install` command, `helm` will create the release namespace if it does not exist.

See [helm install](https://helm.sh/docs/helm/helm_install) for command documentation.

### Upgrade the chart

```shell
helm upgrade [RELEASE_NAME] trainer/trainer [flags]
```

See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade) for command documentation.

### Uninstall the chart

```shell
helm uninstall [RELEASE_NAME]
```

This removes all the Kubernetes resources associated with the chart and deletes the release, except for the `crds`, those will have to be removed manually.

See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command documentation.

{{ template "chart.valuesSection" . }}

{{ template "chart.maintainersSection" . }}
Loading

0 comments on commit 325bb3a

Please sign in to comment.