Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Keep Docker Sidecar Alive for Data Retrieval #552

Merged
merged 18 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions .github/workflows/e2e-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -196,8 +196,29 @@ jobs:
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}

- name: Add Secret Credentials

# Retrieve E2E ACR credentials and create Kubernetes secret
- name: Set up E2E ACR Credentials and Secret
shell: bash
run: |
# Retrieve the ACR username and password
ACR_USERNAME=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "username" -o tsv)
ACR_PASSWORD=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "passwords[0].value" -o tsv)

# Ensure credentials were retrieved successfully
if [ -z "$ACR_USERNAME" ] || [ -z "$ACR_PASSWORD" ]; then
echo "Failed to retrieve ACR credentials"
exit 1
fi

# Create the Kubernetes secret with the retrieved credentials
kubectl create secret docker-registry ${{ env.CLUSTER_NAME }}-acr-secret \
--docker-server=${{ env.CLUSTER_NAME }}.azurecr.io \
--docker-username=${ACR_USERNAME} \
--docker-password=${ACR_PASSWORD}

# Add Private-Hosted ACR secret for private models like llama
- name: Add Private-Hosted ACR Secret Credentials
run: |
kubectl create secret docker-registry ${{ secrets.E2E_AMRT_SECRET_NAME }} \
--docker-server=${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io \
Expand All @@ -218,6 +239,7 @@ jobs:
AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io
AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }}
E2E_ACR_REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io
E2E_ACR_REGISTRY_SECRET: ${{ env.CLUSTER_NAME }}-acr-secret

- name: Cleanup e2e resources
if: ${{ always() }}
Expand Down
12 changes: 12 additions & 0 deletions api/v1alpha1/workspace_validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,12 @@ func (r *DataSource) validateCreate() (errs *apis.FieldError) {
re := regexp.MustCompile(`^(.+/[^:/]+):([^:/]+)$`)
if !re.MatchString(r.Image) {
errs = errs.Also(apis.ErrInvalidValue("Invalid image format, require full input image URL", "Image"))
} else {
// Executes if image is of correct format
err := utils.ExtractAndValidateRepoName(r.Image)
if err != nil {
errs = errs.Also(apis.ErrInvalidValue(err.Error(), "Image"))
}
}
sourcesSpecified++
}
Expand Down Expand Up @@ -271,6 +277,12 @@ func (r *DataDestination) validateCreate() (errs *apis.FieldError) {
re := regexp.MustCompile(`^(.+/[^:/]+):([^:/]+)$`)
if !re.MatchString(r.Image) {
errs = errs.Also(apis.ErrInvalidValue("Invalid image format, require full output image URL", "Image"))
} else {
// Executes if image is of correct format
err := utils.ExtractAndValidateRepoName(r.Image)
if err != nil {
errs = errs.Also(apis.ErrInvalidValue(err.Error(), "Image"))
}
}
// Cloud Provider requires credentials to push image
if r.ImagePushSecret == "" {
Expand Down
22 changes: 22 additions & 0 deletions api/v1alpha1/workspace_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1068,6 +1068,28 @@ func TestTuningSpecValidateCreate(t *testing.T) {
wantErr: true,
errFields: []string{"Method"},
},
{
name: "Invalid Input Source Casing",
tuningSpec: &TuningSpec{
Input: &DataSource{Name: "valid-input", Image: "AZURE_ACR.azurecr.io/INPUT:0.0.0"},
Output: &DataDestination{Image: "AZURE_ACR.azurecr.io/output:0.0.0", ImagePushSecret: "secret"},
Preset: &PresetSpec{PresetMeta: PresetMeta{Name: ModelName("test-validation")}},
Method: TuningMethodLora,
},
wantErr: true,
errFields: []string{"Image"},
},
{
name: "Invalid Output Destination Casing",
tuningSpec: &TuningSpec{
Input: &DataSource{Name: "valid-input", Image: "AZURE_ACR.azurecr.io/input:0.0.0"},
Output: &DataDestination{Image: "AZURE_ACR.azurecr.io/OUTPUT:0.0.0", ImagePushSecret: "secret"},
Preset: &PresetSpec{PresetMeta: PresetMeta{Name: ModelName("test-validation")}},
Method: TuningMethodLora,
},
wantErr: true,
errFields: []string{"Image"},
},
}

for _, tt := range tests {
Expand Down
11 changes: 7 additions & 4 deletions pkg/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,16 +157,19 @@
if job.Status.Succeeded > 0 {
if updateErr := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeSucceeded, metav1.ConditionTrue,
"workspaceSucceeded", "workspace succeeds"); updateErr != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, err
klog.ErrorS(updateErr, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, updateErr

Check warning on line 161 in pkg/controllers/workspace_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/workspace_controller.go#L160-L161

Added lines #L160 - L161 were not covered by tests
}
} else { // The job is still running
if updateErr := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeSucceeded, metav1.ConditionFalse,
"workspacePending", "workspace has not completed"); updateErr != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, err
klog.ErrorS(updateErr, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, updateErr

Check warning on line 167 in pkg/controllers/workspace_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/workspace_controller.go#L166-L167

Added lines #L166 - L167 were not covered by tests
}
}
} else {
klog.ErrorS(err, "failed to get job resource", "workspace", klog.KObj(wObj))
return reconcile.Result{}, err

Check warning on line 172 in pkg/controllers/workspace_controller.go

View check run for this annotation

Codecov / codecov/patch

pkg/controllers/workspace_controller.go#L170-L172

Added lines #L170 - L172 were not covered by tests
}
} else if wObj.Inference != nil {
if err := c.ensureService(ctx, wObj); err != nil {
Expand Down
67 changes: 41 additions & 26 deletions pkg/tuning/preset-tuning.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,34 +154,50 @@
done
echo 'Docker daemon started'

PUSH_SUCCEEDED=false

Check warning on line 157 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L157

Added line #L157 was not covered by tests

while true; do
FILE_PATH=$(find %s -name 'fine_tuning_completed.txt')
if [ ! -z "$FILE_PATH" ]; then
echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH"

PARENT_DIR=$(dirname "$FILE_PATH")
echo "Parent directory is $PARENT_DIR"

TEMP_CONTEXT=$(mktemp -d)
cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json"
cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors"

# Create a minimal Dockerfile
echo 'FROM busybox:latest
RUN mkdir -p /data
ADD adapter_config.json /data/
ADD adapter_model.safetensors /data/' > "$TEMP_CONTEXT/Dockerfile"

docker build -t %s "$TEMP_CONTEXT"
docker push %s

# Cleanup: Remove the temporary directory
rm -rf "$TEMP_CONTEXT"

# Remove the file to prevent repeated builds
rm "$FILE_PATH"
echo "Upload complete"
exit 0
if [ "$PUSH_SUCCEEDED" = false ]; then
echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH"

Check warning on line 163 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L162-L163

Added lines #L162 - L163 were not covered by tests

PARENT_DIR=$(dirname "$FILE_PATH")
echo "Parent directory is $PARENT_DIR"

Check warning on line 166 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L165-L166

Added lines #L165 - L166 were not covered by tests

TEMP_CONTEXT=$(mktemp -d)
cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json"
cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors"

Check warning on line 170 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L168-L170

Added lines #L168 - L170 were not covered by tests

# Create a minimal Dockerfile
echo 'FROM busybox:latest
RUN mkdir -p /data
ADD adapter_config.json /data/
ADD adapter_model.safetensors /data/' > "$TEMP_CONTEXT/Dockerfile"

Check warning on line 176 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L172-L176

Added lines #L172 - L176 were not covered by tests

# Add symbolic link to read-only mounted config.json
mkdir -p /root/.docker
ln -s /tmp/.docker/config/config.json /root/.docker/config.json

Check warning on line 180 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L178-L180

Added lines #L178 - L180 were not covered by tests

docker build -t %s "$TEMP_CONTEXT"

Check warning on line 182 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L182

Added line #L182 was not covered by tests

while true; do
if docker push %s; then
echo "Upload complete"
# Cleanup: Remove the temporary directory
rm -rf "$TEMP_CONTEXT"
# Remove the file to prevent repeated builds
rm "$FILE_PATH"
PUSH_SUCCEEDED=true
# Signal completion
touch /tmp/upload_complete
exit 0
else
echo "Push failed, retrying in 30 seconds..."
sleep 30
fi
done
fi

Check warning on line 200 in pkg/tuning/preset-tuning.go

View check run for this annotation

Codecov / codecov/patch

pkg/tuning/preset-tuning.go#L184-L200

Added lines #L184 - L200 were not covered by tests
fi
sleep 10 # Check every 10 seconds
done`, outputDir, image, image)
Expand Down Expand Up @@ -369,7 +385,6 @@
Command: []string{"/bin/sh", "-c"},
Args: []string{dockerSidecarScriptPushImage(outputDir, image)},
}

volume, volumeMount := utils.ConfigImagePushSecretVolume(imagePushSecret)
return sidecarContainer, volume, volumeMount
}
Expand Down
21 changes: 14 additions & 7 deletions pkg/utils/common-preset.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,20 @@
volume := corev1.Volume{
Name: "docker-config",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: imagePushSecret,
Items: []corev1.KeyToPath{
Projected: &corev1.ProjectedVolumeSource{
Sources: []corev1.VolumeProjection{

Check warning on line 35 in pkg/utils/common-preset.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common-preset.go#L34-L35

Added lines #L34 - L35 were not covered by tests
{
Key: ".dockerconfigjson",
Path: "config.json",
Secret: &corev1.SecretProjection{
LocalObjectReference: corev1.LocalObjectReference{
Name: imagePushSecret,
},
Items: []corev1.KeyToPath{

Check warning on line 41 in pkg/utils/common-preset.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common-preset.go#L37-L41

Added lines #L37 - L41 were not covered by tests
{
Key: ".dockerconfigjson",
Path: "config.json",
},
},
},

Check warning on line 47 in pkg/utils/common-preset.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common-preset.go#L43-L47

Added lines #L43 - L47 were not covered by tests
},
},
},
Expand All @@ -45,8 +53,7 @@

volumeMount := corev1.VolumeMount{
Name: "docker-config",
MountPath: "/root/.docker/config.json",
SubPath: "config.json", // Mount only the config.json file
MountPath: "/tmp/.docker/config",

Check warning on line 56 in pkg/utils/common-preset.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common-preset.go#L56

Added line #L56 was not covered by tests
}

return volume, volumeMount
Expand Down
16 changes: 16 additions & 0 deletions pkg/utils/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"knative.dev/pkg/apis"
"os"
"sigs.k8s.io/controller-runtime/pkg/client"
"strings"
)

func Contains(s []string, e string) bool {
Expand Down Expand Up @@ -172,3 +173,18 @@
}
return ""
}

// ExtractAndValidateRepoName checks that the repository name in an image
// reference is entirely lowercase.
//
// The repository name is taken to be the text after the last "/" in image and
// before the first ":" that follows it. The registry host, any earlier path
// segments, and the tag are not inspected.
//
// It returns nil when that segment is equal to its lowercased form, and a
// non-nil error otherwise. An image containing no "/" or no ":" is handled
// without error: the whole string (or whole last segment) is used as the name.
//
// NOTE(review): only the final path segment is checked, so an uppercase letter
// in an earlier segment (e.g. "host/Org/repo:tag") passes — confirm that this
// is intended.
func ExtractAndValidateRepoName(image string) error {

Check warning on line 177 in pkg/utils/common.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common.go#L177

Added line #L177 was not covered by tests
// Isolate the final "/"-separated segment, then cut it at the first ":" to
// drop the tag.
// Example image: modelsregistry.azurecr.io/ADAPTER_HERE:0.0.1
// strings.Split returns at least one element for a non-empty separator, so
// both index expressions below are safe even for an empty image string.
parts := strings.Split(image, "/")
lastPart := parts[len(parts)-1] // e.g. "ADAPTER_HERE:0.0.1"
repoName := strings.Split(lastPart, ":")[0] // e.g. "ADAPTER_HERE"

Check warning on line 182 in pkg/utils/common.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common.go#L180-L182

Added lines #L180 - L182 were not covered by tests

// Reject the name if lowercasing it would change it, i.e. it contains at
// least one uppercase letter.
// NOTE(review): Go convention prefers lowercase error strings; the text is
// left unchanged here because this is a documentation-only edit.
if repoName != strings.ToLower(repoName) {
return fmt.Errorf("Repository name must be lowercase")

Check warning on line 186 in pkg/utils/common.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common.go#L185-L186

Added lines #L185 - L186 were not covered by tests
}

// The extracted name is already all lowercase.
return nil

Check warning on line 189 in pkg/utils/common.go

View check run for this annotation

Codecov / codecov/patch

pkg/utils/common.go#L189

Added line #L189 was not covered by tests
}
9 changes: 7 additions & 2 deletions test/e2e/preset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,13 @@ func loadTestEnvVars() {
runLlama13B = false
}

// Required for Llama models
aiModelsRegistry = utils.GetEnv("AI_MODELS_REGISTRY")
aiModelsRegistrySecret = utils.GetEnv("AI_MODELS_REGISTRY_SECRET")
// Currently required for uploading fine-tuning results
e2eACRSecret = utils.GetEnv("E2E_ACR_REGISTRY_SECRET")
supportedModelsYamlPath = utils.GetEnv("SUPPORTED_MODELS_YAML_PATH")
azureClusterName = utils.GetEnv("AZURE_CLUSTER_NAME")
}

func loadModelVersions() {
Expand Down Expand Up @@ -223,7 +227,7 @@ func createPhi3TuningWorkspaceWithPresetPublicMode(configMapName string, numOfNo
workspaceObj = utils.GenerateE2ETuningWorkspaceManifest(uniqueID, namespaceName, "",
fullDatasetImageName, outputRegistryUrl, numOfNode, "Standard_NC6s_v3", &metav1.LabelSelector{
MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-tuning-falcon"},
}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, []string{aiModelsRegistrySecret}, configMapName)
}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, []string{e2eACRSecret}, configMapName)

createAndValidateWorkspace(workspaceObj)
})
Expand Down Expand Up @@ -542,6 +546,7 @@ func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error {
var runLlama13B bool
var aiModelsRegistry string
var aiModelsRegistrySecret string
var e2eACRSecret string
var supportedModelsYamlPath string
var modelInfo map[string]string
var azureClusterName string
Expand Down Expand Up @@ -707,7 +712,7 @@ var _ = Describe("Workspace Preset", func() {

It("should create a workspace for tuning successfully", func() {
numOfNode := 1
err := copySecretToNamespace(aiModelsRegistrySecret, namespaceName)
err := copySecretToNamespace(e2eACRSecret, namespaceName)
if err != nil {
log.Fatalf("Error copying secret: %v", err)
}
Expand Down
Loading