Skip to content

Commit

Permalink
feat: Tuning job update supporting (#580)
Browse files Browse the repository at this point in the history
**Reason for Change**:

1. Update tuning jobs when tuning input or output changes are detected
2. Add e2e tests
3. Add unit test for update

**Requirements**

- [ ] added unit tests and e2e tests (if applicable).

**Issue Fixed**:
<!-- If this PR fixes GitHub issue 4321, add "Fixes #4321" to the next
line. -->

**Notes for Reviewers**:

---------

Signed-off-by: Bangqi Zhu <bangqizhu@microsoft.com>
Co-authored-by: Bangqi Zhu <bangqizhu@microsoft.com>
  • Loading branch information
bangqipropel and Bangqi Zhu authored Aug 30, 2024
1 parent d346213 commit 8785cc5
Show file tree
Hide file tree
Showing 13 changed files with 189 additions and 143 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,19 @@ docker-build-adapter: docker-buildx
.PHONY: docker-build-dataset
docker-build-dataset: docker-buildx
docker buildx build \
--file ./docker/dataset/Dockerfile \
--build-arg ADAPTER_PATH=docker/datasets/dataset1 \
--file ./docker/datasets/Dockerfile \
--output=$(OUTPUT_TYPE) \
--platform="linux/$(ARCH)" \
--pull \
--tag $(REGISTRY)/e2e-dataset:0.0.1 .
docker buildx build \
--build-arg ADAPTER_PATH=docker/datasets/dataset2 \
--file ./docker/datasets/Dockerfile \
--output=$(OUTPUT_TYPE) \
--platform="linux/$(ARCH)" \
--pull \
--tag $(REGISTRY)/e2e-dataset2:0.0.1 .

## --------------------------------------
## Kaito Installation
Expand Down
40 changes: 4 additions & 36 deletions api/v1alpha1/workspace_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ package v1alpha1
import (
"context"
"fmt"
"github.com/azure/kaito/pkg/utils/consts"
"reflect"
"regexp"
"sort"
"strconv"
"strings"

"github.com/azure/kaito/pkg/utils/consts"

"github.com/azure/kaito/pkg/utils"
"github.com/azure/kaito/pkg/utils/plugin"

Expand Down Expand Up @@ -183,7 +183,7 @@ func (r *TuningSpec) validateUpdate(old *TuningSpec) (errs *apis.FieldError) {
if r.Output == nil {
errs = errs.Also(apis.ErrMissingField("Output"))
} else {
errs = errs.Also(r.Output.validateUpdate(old.Output).ViaField("Output"))
errs = errs.Also(r.Output.validateUpdate().ViaField("Output"))
}
if !reflect.DeepEqual(old.Preset, r.Preset) {
errs = errs.Also(apis.ErrGeneric("Preset cannot be changed", "Preset"))
Expand Down Expand Up @@ -235,33 +235,7 @@ func (r *DataSource) validateUpdate(old *DataSource, isTuning bool) (errs *apis.
if r.Volume != nil {
errs = errs.Also(apis.ErrInvalidValue("Volume support is not implemented yet", "Volume"))
}
oldURLs := make([]string, len(old.URLs))
copy(oldURLs, old.URLs)
sort.Strings(oldURLs)

newURLs := make([]string, len(r.URLs))
copy(newURLs, r.URLs)
sort.Strings(newURLs)

if !reflect.DeepEqual(oldURLs, newURLs) {
errs = errs.Also(apis.ErrInvalidValue("URLs field cannot be changed once set", "URLs"))
}
// TODO: check if the Volume is changed
if old.Image != r.Image {
errs = errs.Also(apis.ErrInvalidValue("Image field cannot be changed once set", "Image"))
}

oldSecrets := make([]string, len(old.ImagePullSecrets))
copy(oldSecrets, old.ImagePullSecrets)
sort.Strings(oldSecrets)

newSecrets := make([]string, len(r.ImagePullSecrets))
copy(newSecrets, r.ImagePullSecrets)
sort.Strings(newSecrets)

if !reflect.DeepEqual(oldSecrets, newSecrets) {
errs = errs.Also(apis.ErrInvalidValue("ImagePullSecrets field cannot be changed once set", "ImagePullSecrets"))
}
return errs
}

Expand Down Expand Up @@ -298,18 +272,12 @@ func (r *DataDestination) validateCreate() (errs *apis.FieldError) {
return errs
}

func (r *DataDestination) validateUpdate(old *DataDestination) (errs *apis.FieldError) {
func (r *DataDestination) validateUpdate() (errs *apis.FieldError) {
// TODO: Implement Volumes
if r.Volume != nil {
errs = errs.Also(apis.ErrInvalidValue("Volume support is not implemented yet", "Volume"))
}
if old.Image != r.Image {
errs = errs.Also(apis.ErrInvalidValue("Image field cannot be changed once set", "Image"))
}

if old.ImagePushSecret != r.ImagePushSecret {
errs = errs.Also(apis.ErrInvalidValue("ImagePushSecret field cannot be changed once set", "ImagePushSecret"))
}
return errs
}

Expand Down
57 changes: 1 addition & 56 deletions api/v1alpha1/workspace_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1302,39 +1302,6 @@ func TestDataSourceValidateUpdate(t *testing.T) {
wantErr: true,
errFields: []string{"Name"},
},
{
name: "URLs changed",
oldSource: &DataSource{
URLs: []string{"http://example.com/old"},
},
newSource: &DataSource{
URLs: []string{"http://example.com/new"},
},
wantErr: true,
errFields: []string{"URLs"},
},
{
name: "Image changed",
oldSource: &DataSource{
Image: "old-image:latest",
},
newSource: &DataSource{
Image: "new-image:latest",
},
wantErr: true,
errFields: []string{"Image"},
},
{
name: "ImagePullSecrets changed",
oldSource: &DataSource{
ImagePullSecrets: []string{"old-secret"},
},
newSource: &DataSource{
ImagePullSecrets: []string{"new-secret"},
},
wantErr: true,
errFields: []string{"ImagePullSecrets"},
},
}

for _, tt := range tests {
Expand Down Expand Up @@ -1450,33 +1417,11 @@ func TestDataDestinationValidateUpdate(t *testing.T) {
},
wantErr: false,
},
{
name: "Image changed",
oldDest: &DataDestination{
Image: "old-image:latest",
},
newDest: &DataDestination{
Image: "new-image:latest",
},
wantErr: true,
errFields: []string{"Image"},
},
{
name: "ImagePushSecret changed",
oldDest: &DataDestination{
ImagePushSecret: "old-secret",
},
newDest: &DataDestination{
ImagePushSecret: "new-secret",
},
wantErr: true,
errFields: []string{"ImagePushSecret"},
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
errs := tt.newDest.validateUpdate(tt.oldDest)
errs := tt.newDest.validateUpdate()
hasErrs := errs != nil

if hasErrs != tt.wantErr {
Expand Down
5 changes: 0 additions & 5 deletions docker/dataset/Dockerfile

This file was deleted.

21 changes: 0 additions & 21 deletions docker/dataset/README.md

This file was deleted.

7 changes: 7 additions & 0 deletions docker/datasets/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
FROM busybox:latest

ARG DATASET_PATH=docker/datasets/dataset1

RUN mkdir -p /data

COPY ${DATASET_PATH}/dataset.parquet /data/
21 changes: 21 additions & 0 deletions docker/datasets/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# E2E Fine-Tuning Dataset Files

## Overview

The dataset files are used for conducting end-to-end (E2E) testing for fine-tuning. The Dockerfile builds an image incorporating the [dolly-15k-oai-style](https://huggingface.co/datasets/philschmid/dolly-15k-oai-style) and [kubernetes-reformatted-remove-outliers](https://huggingface.co/datasets/ishaansehgal99/kubernetes-reformatted-remove-outliers) datasets, which are then used within init containers specifically for fine-tuning.

## Files

- **Dockerfile**: Builds the Docker image for the E2E tests.

- **dataset.parquet**: The datasets themselves, downloaded from [dolly-15k-oai-style](https://huggingface.co/datasets/philschmid/dolly-15k-oai-style) and [kubernetes-reformatted-remove-outliers](https://huggingface.co/datasets/ishaansehgal99/kubernetes-reformatted-remove-outliers)


## Usage

Build the Docker images with the following command:

```bash

make docker-build-dataset

File renamed without changes.
Binary file added docker/datasets/dataset2/dataset.parquet
Binary file not shown.
20 changes: 19 additions & 1 deletion pkg/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -704,15 +704,33 @@ func (c *WorkspaceReconciler) applyTuning(ctx context.Context, wObj *kaitov1alph

tuningParam := model.GetTuningParameters()
existingObj := &batchv1.Job{}
revisionNum := wObj.Annotations[kaitov1alpha1.WorkspaceRevisionAnnotation]
if err = resources.GetResource(ctx, wObj.Name, wObj.Namespace, c.Client, existingObj); err == nil {
klog.InfoS("A tuning workload already exists for workspace", "workspace", klog.KObj(wObj))

if existingObj.Annotations[kaitov1alpha1.WorkspaceRevisionAnnotation] != revisionNum {
deletePolicy := metav1.DeletePropagationForeground
if err := c.Delete(ctx, existingObj, &client.DeleteOptions{
PropagationPolicy: &deletePolicy,
}); err != nil {
return
}

var workloadObj client.Object
workloadObj, err = tuning.CreatePresetTuning(ctx, wObj, revisionNum, tuningParam, c.Client)
if err != nil {
return
}
existingObj = workloadObj.(*batchv1.Job)
}

if err = resources.CheckResourceStatus(existingObj, c.Client, tuningParam.ReadinessTimeout); err != nil {
return
}
} else if apierrors.IsNotFound(err) {
var workloadObj client.Object
// Need to create a new workload
workloadObj, err = tuning.CreatePresetTuning(ctx, wObj, tuningParam, c.Client)
workloadObj, err = tuning.CreatePresetTuning(ctx, wObj, revisionNum, tuningParam, c.Client)
if err != nil {
return
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/resources/manifests.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ func GenerateStatefulSetManifest(ctx context.Context, workspaceObj *kaitov1alpha
return ss
}

func GenerateTuningJobManifest(ctx context.Context, wObj *kaitov1alpha1.Workspace, imageName string,
func GenerateTuningJobManifest(ctx context.Context, wObj *kaitov1alpha1.Workspace, revisionNum string, imageName string,
imagePullSecretRefs []corev1.LocalObjectReference, replicas int, commands []string, containerPorts []corev1.ContainerPort,
livenessProbe, readinessProbe *corev1.Probe, resourceRequirements corev1.ResourceRequirements, tolerations []corev1.Toleration,
initContainers []corev1.Container, sidecarContainers []corev1.Container, volumes []corev1.Volume, volumeMounts []corev1.VolumeMount,
Expand Down Expand Up @@ -225,6 +225,9 @@ func GenerateTuningJobManifest(ctx context.Context, wObj *kaitov1alpha1.Workspac
Name: wObj.Name,
Namespace: wObj.Namespace,
Labels: labels,
Annotations: map[string]string{
kaitov1alpha1.WorkspaceRevisionAnnotation: revisionNum,
},
OwnerReferences: []v1.OwnerReference{
{
APIVersion: kaitov1alpha1.GroupVersion.String(),
Expand Down
4 changes: 2 additions & 2 deletions pkg/tuning/preset-tuning.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace, cmName str
return volumes, volumeMounts
}

func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace,
func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace, revisionNum string,
tuningObj *model.PresetParam, kubeClient client.Client) (client.Object, error) {
cm, err := EnsureTuningConfigMap(ctx, workspaceObj, kubeClient)
if err != nil {
Expand Down Expand Up @@ -348,7 +348,7 @@ func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspa
Value: "k_proj,q_proj,v_proj,o_proj,gate_proj,down_proj,up_proj",
})
}
jobObj := resources.GenerateTuningJobManifest(ctx, workspaceObj, tuningImage, imagePullSecrets, *workspaceObj.Resource.Count, commands,
jobObj := resources.GenerateTuningJobManifest(ctx, workspaceObj, revisionNum, tuningImage, imagePullSecrets, *workspaceObj.Resource.Count, commands,
containerPorts, nil, nil, resourceReq, tolerations, initContainers, sidecarContainers, volumes, volumeMounts, envVars)

err = resources.CreateResource(ctx, jobObj, kubeClient)
Expand Down
Loading

0 comments on commit 8785cc5

Please sign in to comment.