fix: Use Projected Volume and Keep Docker Sidecar Alive for Data Retrieval (#552)

**Reason for Change**:
- Keep the Docker sidecar container alive so that, if the ACR push fails, we
can still exec into the container and retrieve the completed tuning job files
from the /mnt/results folder (see the sketch after this list).

- Add a validation check that rejects input and output images whose repository
names contain uppercase characters.

- Use a projected volume for mounting the Docker config secret: the secret is
mounted read-only at /tmp/.docker/config and symlinked to /root/.docker/config.json.

- Add a secret for the E2E Temp ACR.
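
A minimal sketch of the manual retrieval path the first bullet refers to, shelling out to kubectl from Go. The namespace, pod, and sidecar container names below are placeholders, not values taken from this change:

```go
package main

import (
	"fmt"
	"os/exec"
)

// copyTuningResults copies the finished tuning artifacts out of the
// still-running Docker sidecar after a failed ACR push. The namespace, pod,
// and container names are placeholders for illustration only.
func copyTuningResults(namespace, pod, container string) error {
	// Equivalent to: kubectl cp <ns>/<pod>:/mnt/results ./results -c <container>
	cmd := exec.Command("kubectl", "cp",
		fmt.Sprintf("%s/%s:/mnt/results", namespace, pod),
		"./results",
		"-c", container)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("kubectl cp failed: %v: %s", err, out)
	}
	return nil
}

func main() {
	if err := copyTuningResults("default", "tuning-job-pod-0", "docker-sidecar"); err != nil {
		fmt.Println(err)
	}
}
```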
ishaansehgal99 authored Aug 15, 2024
1 parent 868efc3 commit 8b7aabf
Showing 8 changed files with 143 additions and 41 deletions.
26 changes: 24 additions & 2 deletions .github/workflows/e2e-workflow.yml
@@ -196,8 +196,29 @@ jobs:
AZURE_CLUSTER_NAME: ${{ env.CLUSTER_NAME }}
REGISTRY: ${{ env.REGISTRY }}
VERSION: ${{ env.VERSION }}

- name: Add Secret Credentials

# Retrieve E2E ACR credentials and create Kubernetes secret
- name: Set up E2E ACR Credentials and Secret
shell: bash
run: |
# Retrieve the ACR username and password
ACR_USERNAME=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "username" -o tsv)
ACR_PASSWORD=$(az acr credential show --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.CLUSTER_NAME }} --query "passwords[0].value" -o tsv)
# Ensure credentials were retrieved successfully
if [ -z "$ACR_USERNAME" ] || [ -z "$ACR_PASSWORD" ]; then
echo "Failed to retrieve ACR credentials"
exit 1
fi
# Create the Kubernetes secret with the retrieved credentials
kubectl create secret docker-registry ${{ env.CLUSTER_NAME }}-acr-secret \
--docker-server=${{ env.CLUSTER_NAME }}.azurecr.io \
--docker-username=${ACR_USERNAME} \
--docker-password=${ACR_PASSWORD}
# Add Private-Hosted ACR secret for private models like llama
- name: Add Private-Hosted ACR Secret Credentials
run: |
kubectl create secret docker-registry ${{ secrets.E2E_AMRT_SECRET_NAME }} \
--docker-server=${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io \
@@ -218,6 +239,7 @@ jobs:
AI_MODELS_REGISTRY: ${{ secrets.E2E_ACR_AMRT_USERNAME }}.azurecr.io
AI_MODELS_REGISTRY_SECRET: ${{ secrets.E2E_AMRT_SECRET_NAME }}
E2E_ACR_REGISTRY: ${{ env.CLUSTER_NAME }}.azurecr.io
E2E_ACR_REGISTRY_SECRET: ${{ env.CLUSTER_NAME }}-acr-secret

- name: Cleanup e2e resources
if: ${{ always() }}
12 changes: 12 additions & 0 deletions api/v1alpha1/workspace_validation.go
@@ -210,6 +210,12 @@ func (r *DataSource) validateCreate() (errs *apis.FieldError) {
re := regexp.MustCompile(`^(.+/[^:/]+):([^:/]+)$`)
if !re.MatchString(r.Image) {
errs = errs.Also(apis.ErrInvalidValue("Invalid image format, require full input image URL", "Image"))
} else {
// Executes if image is of correct format
err := utils.ExtractAndValidateRepoName(r.Image)
if err != nil {
errs = errs.Also(apis.ErrInvalidValue(err.Error(), "Image"))
}
}
sourcesSpecified++
}
@@ -271,6 +277,12 @@ func (r *DataDestination) validateCreate() (errs *apis.FieldError) {
re := regexp.MustCompile(`^(.+/[^:/]+):([^:/]+)$`)
if !re.MatchString(r.Image) {
errs = errs.Also(apis.ErrInvalidValue("Invalid image format, require full output image URL", "Image"))
} else {
// Executes if image is of correct format
err := utils.ExtractAndValidateRepoName(r.Image)
if err != nil {
errs = errs.Also(apis.ErrInvalidValue(err.Error(), "Image"))
}
}
// Cloud Provider requires credentials to push image
if r.ImagePushSecret == "" {
22 changes: 22 additions & 0 deletions api/v1alpha1/workspace_validation_test.go
@@ -1068,6 +1068,28 @@ func TestTuningSpecValidateCreate(t *testing.T) {
wantErr: true,
errFields: []string{"Method"},
},
{
name: "Invalid Input Source Casing",
tuningSpec: &TuningSpec{
Input: &DataSource{Name: "valid-input", Image: "AZURE_ACR.azurecr.io/INPUT:0.0.0"},
Output: &DataDestination{Image: "AZURE_ACR.azurecr.io/output:0.0.0", ImagePushSecret: "secret"},
Preset: &PresetSpec{PresetMeta: PresetMeta{Name: ModelName("test-validation")}},
Method: TuningMethodLora,
},
wantErr: true,
errFields: []string{"Image"},
},
{
name: "Invalid Output Destination Casing",
tuningSpec: &TuningSpec{
Input: &DataSource{Name: "valid-input", Image: "AZURE_ACR.azurecr.io/input:0.0.0"},
Output: &DataDestination{Image: "AZURE_ACR.azurecr.io/OUTPUT:0.0.0", ImagePushSecret: "secret"},
Preset: &PresetSpec{PresetMeta: PresetMeta{Name: ModelName("test-validation")}},
Method: TuningMethodLora,
},
wantErr: true,
errFields: []string{"Image"},
},
}

for _, tt := range tests {
11 changes: 7 additions & 4 deletions pkg/controllers/workspace_controller.go
@@ -157,16 +157,19 @@ func (c *WorkspaceReconciler) addOrUpdateWorkspace(ctx context.Context, wObj *ka
if job.Status.Succeeded > 0 {
if updateErr := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeSucceeded, metav1.ConditionTrue,
"workspaceSucceeded", "workspace succeeds"); updateErr != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, err
klog.ErrorS(updateErr, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, updateErr
}
} else { // The job is still running
if updateErr := c.updateStatusConditionIfNotMatch(ctx, wObj, kaitov1alpha1.WorkspaceConditionTypeSucceeded, metav1.ConditionFalse,
"workspacePending", "workspace has not completed"); updateErr != nil {
klog.ErrorS(err, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, err
klog.ErrorS(updateErr, "failed to update workspace status", "workspace", klog.KObj(wObj))
return reconcile.Result{}, updateErr
}
}
} else {
klog.ErrorS(err, "failed to get job resource", "workspace", klog.KObj(wObj))
return reconcile.Result{}, err
}
} else if wObj.Inference != nil {
if err := c.ensureService(ctx, wObj); err != nil {
67 changes: 41 additions & 26 deletions pkg/tuning/preset-tuning.go
@@ -154,34 +154,50 @@ while ! docker info > /dev/null 2>&1; do
done
echo 'Docker daemon started'
PUSH_SUCCEEDED=false
while true; do
FILE_PATH=$(find %s -name 'fine_tuning_completed.txt')
if [ ! -z "$FILE_PATH" ]; then
echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH"
PARENT_DIR=$(dirname "$FILE_PATH")
echo "Parent directory is $PARENT_DIR"
TEMP_CONTEXT=$(mktemp -d)
cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json"
cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors"
# Create a minimal Dockerfile
echo 'FROM busybox:latest
RUN mkdir -p /data
ADD adapter_config.json /data/
ADD adapter_model.safetensors /data/' > "$TEMP_CONTEXT/Dockerfile"
docker build -t %s "$TEMP_CONTEXT"
docker push %s
# Cleanup: Remove the temporary directory
rm -rf "$TEMP_CONTEXT"
# Remove the file to prevent repeated builds
rm "$FILE_PATH"
echo "Upload complete"
exit 0
if [ "$PUSH_SUCCEEDED" = false ]; then
echo "FOUND TRAINING COMPLETED FILE at $FILE_PATH"
PARENT_DIR=$(dirname "$FILE_PATH")
echo "Parent directory is $PARENT_DIR"
TEMP_CONTEXT=$(mktemp -d)
cp "$PARENT_DIR/adapter_config.json" "$TEMP_CONTEXT/adapter_config.json"
cp -r "$PARENT_DIR/adapter_model.safetensors" "$TEMP_CONTEXT/adapter_model.safetensors"
# Create a minimal Dockerfile
echo 'FROM busybox:latest
RUN mkdir -p /data
ADD adapter_config.json /data/
ADD adapter_model.safetensors /data/' > "$TEMP_CONTEXT/Dockerfile"
# Add symbolic link to read-only mounted config.json
mkdir -p /root/.docker
ln -s /tmp/.docker/config/config.json /root/.docker/config.json
docker build -t %s "$TEMP_CONTEXT"
while true; do
if docker push %s; then
echo "Upload complete"
# Cleanup: Remove the temporary directory
rm -rf "$TEMP_CONTEXT"
# Remove the file to prevent repeated builds
rm "$FILE_PATH"
PUSH_SUCCEEDED=true
# Signal completion
touch /tmp/upload_complete
exit 0
else
echo "Push failed, retrying in 30 seconds..."
sleep 30
fi
done
fi
fi
sleep 10 # Check every 10 seconds
done`, outputDir, image, image)
@@ -369,7 +385,6 @@ func handleImageDataDestination(ctx context.Context, outputDir, image, imagePush
Command: []string{"/bin/sh", "-c"},
Args: []string{dockerSidecarScriptPushImage(outputDir, image)},
}

volume, volumeMount := utils.ConfigImagePushSecretVolume(imagePushSecret)
return sidecarContainer, volume, volumeMount
}
21 changes: 14 additions & 7 deletions pkg/utils/common-preset.go
@@ -31,12 +31,20 @@ func ConfigImagePushSecretVolume(imagePushSecret string) (corev1.Volume, corev1.
volume := corev1.Volume{
Name: "docker-config",
VolumeSource: corev1.VolumeSource{
Secret: &corev1.SecretVolumeSource{
SecretName: imagePushSecret,
Items: []corev1.KeyToPath{
Projected: &corev1.ProjectedVolumeSource{
Sources: []corev1.VolumeProjection{
{
Key: ".dockerconfigjson",
Path: "config.json",
Secret: &corev1.SecretProjection{
LocalObjectReference: corev1.LocalObjectReference{
Name: imagePushSecret,
},
Items: []corev1.KeyToPath{
{
Key: ".dockerconfigjson",
Path: "config.json",
},
},
},
},
},
},
@@ -45,8 +53,7 @@ func ConfigImagePushSecretVolume(imagePushSecret string) (corev1.Volume, corev1.

volumeMount := corev1.VolumeMount{
Name: "docker-config",
MountPath: "/root/.docker/config.json",
SubPath: "config.json", // Mount only the config.json file
MountPath: "/tmp/.docker/config",
}

return volume, volumeMount
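
For context, a hedged sketch of how the volume/mount pair returned by ConfigImagePushSecretVolume might be attached to the sidecar's pod spec; the container name, image, and module path are assumptions, not taken from the repository:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"

	"github.com/azure/kaito/pkg/utils" // module path assumed
)

// buildSidecarPodSpec attaches the projected docker-config volume to a sidecar
// container. The secret is exposed as a read-only directory at
// /tmp/.docker/config; the sidecar script then symlinks
// /tmp/.docker/config/config.json to /root/.docker/config.json.
func buildSidecarPodSpec(imagePushSecret string) corev1.PodSpec {
	volume, volumeMount := utils.ConfigImagePushSecretVolume(imagePushSecret)
	return corev1.PodSpec{
		Containers: []corev1.Container{
			{
				Name:         "docker-sidecar", // illustrative name
				Image:        "docker:dind",    // illustrative image
				VolumeMounts: []corev1.VolumeMount{volumeMount},
			},
		},
		Volumes: []corev1.Volume{volume},
	}
}

func main() {
	spec := buildSidecarPodSpec("my-image-push-secret") // secret name is illustrative
	fmt.Println(spec.Containers[0].VolumeMounts[0].MountPath) // /tmp/.docker/config
}
```

Because the read-only mount now lands outside /root/.docker, the symbolic link added to the sidecar script in preset-tuning.go above is what makes the credentials visible to the Docker CLI at its default config location.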
16 changes: 16 additions & 0 deletions pkg/utils/common.go
@@ -16,6 +16,7 @@ import (
"knative.dev/pkg/apis"
"os"
"sigs.k8s.io/controller-runtime/pkg/client"
"strings"
)

func Contains(s []string, e string) bool {
@@ -172,3 +173,18 @@ func GetPerNodeGPUCountFromNodes(nodeList *v1.NodeList) string {
}
return ""
}

func ExtractAndValidateRepoName(image string) error {
// Extract repository name (part after the last / and before the colon :)
// For example given image: modelsregistry.azurecr.io/ADAPTER_HERE:0.0.1
parts := strings.Split(image, "/")
lastPart := parts[len(parts)-1] // Extracts "ADAPTER_HERE:0.0.1"
repoName := strings.Split(lastPart, ":")[0] // Extracts "ADAPTER_HERE"

// Check if repository name is lowercase
if repoName != strings.ToLower(repoName) {
return fmt.Errorf("Repository name must be lowercase")
}

return nil
}
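
A short usage sketch of the new helper; the image references are hypothetical and the module path is assumed:

```go
package main

import (
	"fmt"

	"github.com/azure/kaito/pkg/utils" // module path assumed
)

func main() {
	// Lowercase repository name: passes validation.
	fmt.Println(utils.ExtractAndValidateRepoName("modelsregistry.azurecr.io/adapter-output:0.0.1")) // <nil>

	// Uppercase repository name: rejected.
	fmt.Println(utils.ExtractAndValidateRepoName("modelsregistry.azurecr.io/ADAPTER_HERE:0.0.1")) // Repository name must be lowercase
}
```

Note that only the final path segment is checked, so an uppercase registry host (as in the test cases above) does not by itself trigger the error.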
9 changes: 7 additions & 2 deletions test/e2e/preset_test.go
@@ -53,9 +53,13 @@ func loadTestEnvVars() {
runLlama13B = false
}

// Required for Llama models
aiModelsRegistry = utils.GetEnv("AI_MODELS_REGISTRY")
aiModelsRegistrySecret = utils.GetEnv("AI_MODELS_REGISTRY_SECRET")
// Currently required for uploading fine-tuning results
e2eACRSecret = utils.GetEnv("E2E_ACR_REGISTRY_SECRET")
supportedModelsYamlPath = utils.GetEnv("SUPPORTED_MODELS_YAML_PATH")
azureClusterName = utils.GetEnv("AZURE_CLUSTER_NAME")
}

func loadModelVersions() {
@@ -223,7 +227,7 @@ func createPhi3TuningWorkspaceWithPresetPublicMode(configMapName string, numOfNo
workspaceObj = utils.GenerateE2ETuningWorkspaceManifest(uniqueID, namespaceName, "",
fullDatasetImageName, outputRegistryUrl, numOfNode, "Standard_NC6s_v3", &metav1.LabelSelector{
MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-tuning-falcon"},
}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, []string{aiModelsRegistrySecret}, configMapName)
}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, []string{e2eACRSecret}, configMapName)

createAndValidateWorkspace(workspaceObj)
})
@@ -542,6 +546,7 @@ func deleteWorkspace(workspaceObj *kaitov1alpha1.Workspace) error {
var runLlama13B bool
var aiModelsRegistry string
var aiModelsRegistrySecret string
var e2eACRSecret string
var supportedModelsYamlPath string
var modelInfo map[string]string
var azureClusterName string
@@ -707,7 +712,7 @@ var _ = Describe("Workspace Preset", func() {

It("should create a workspace for tuning successfully", func() {
numOfNode := 1
err := copySecretToNamespace(aiModelsRegistrySecret, namespaceName)
err := copySecretToNamespace(e2eACRSecret, namespaceName)
if err != nil {
log.Fatalf("Error copying secret: %v", err)
}
