diff --git a/.github/workflows/preset-image-build-1ES.yml b/.github/workflows/preset-image-build-1ES.yml
index 51bdfb3c3..be4475025 100644
--- a/.github/workflows/preset-image-build-1ES.yml
+++ b/.github/workflows/preset-image-build-1ES.yml
@@ -88,7 +88,7 @@ jobs:
       fail-fast: false
       matrix:
         model: ${{fromJson(needs.determine-models.outputs.matrix)}}
-      max-parallel: 1
+      max-parallel: 10
     steps:
       - name: Checkout
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
diff --git a/.github/workflows/preset-image-build.yml b/.github/workflows/preset-image-build.yml
index 72f651683..45364c9a8 100644
--- a/.github/workflows/preset-image-build.yml
+++ b/.github/workflows/preset-image-build.yml
@@ -81,7 +81,7 @@ jobs:
       fail-fast: false
       matrix:
         model: ${{fromJson(needs.determine-models.outputs.matrix)}}
-      max-parallel: 1
+      max-parallel: 10
     steps:
       - name: Checkout
         uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
diff --git a/examples/fine-tuning/kaito_workspace_tuning_phi_3.yaml b/examples/fine-tuning/kaito_workspace_tuning_phi_3.yaml
index 71b57bce2..c83c6f67b 100644
--- a/examples/fine-tuning/kaito_workspace_tuning_phi_3.yaml
+++ b/examples/fine-tuning/kaito_workspace_tuning_phi_3.yaml
@@ -15,5 +15,5 @@ tuning:
     urls:
       - "https://huggingface.co/datasets/philschmid/dolly-15k-oai-style/resolve/main/data/train-00000-of-00001-54e3756291ca09c6.parquet?download=true"
   output:
-    image: "ACR_REPO_HERE.azurecr.io/ADAPTER_HERE:0.0.1" # Tuning Output ACR Path
+    image: "ACR_REPO_HERE.azurecr.io/IMAGE_NAME_HERE:0.0.1" # Tuning Output ACR Path
     imagePushSecret: ACR_REGISTRY_SECRET_HERE
diff --git a/pkg/controllers/workspace_controller.go b/pkg/controllers/workspace_controller.go
index cb9cdf034..c717a51c8 100644
--- a/pkg/controllers/workspace_controller.go
+++ b/pkg/controllers/workspace_controller.go
@@ -470,6 +470,18 @@ func (c *WorkspaceReconciler) ensureNodePlugins(ctx context.Context, wObj *kaito
 	}
 }
 
+// getPresetName returns the inference preset name of wObj when one is set,
+// otherwise the tuning preset name, or "" when wObj has neither.
+func getPresetName(wObj *kaitov1alpha1.Workspace) string {
+	if wObj.Inference != nil && wObj.Inference.Preset != nil {
+		return string(wObj.Inference.Preset.Name)
+	}
+	if wObj.Tuning != nil && wObj.Tuning.Preset != nil {
+		return string(wObj.Tuning.Preset.Name)
+	}
+	return ""
+}
+
 func (c *WorkspaceReconciler) ensureService(ctx context.Context, wObj *kaitov1alpha1.Workspace) error {
 	serviceType := corev1.ServiceTypeClusterIP
 	wAnnotation := wObj.GetAnnotations()
@@ -491,18 +503,15 @@ func (c *WorkspaceReconciler) ensureService(ctx context.Context, wObj *kaitov1al
 		return nil
 	}
 
-	if wObj.Inference != nil && wObj.Inference.Preset != nil {
-		presetName := string(wObj.Inference.Preset.Name)
+	if presetName := getPresetName(wObj); presetName != "" {
 		model := plugin.KaitoModelRegister.MustGet(presetName)
 		serviceObj := resources.GenerateServiceManifest(ctx, wObj, serviceType, model.SupportDistributedInference())
-		err = resources.CreateResource(ctx, serviceObj, c.Client)
-		if err != nil {
+		if err := resources.CreateResource(ctx, serviceObj, c.Client); err != nil {
 			return err
 		}
 		if model.SupportDistributedInference() {
 			headlessService := resources.GenerateHeadlessServiceManifest(ctx, wObj)
-			err = resources.CreateResource(ctx, headlessService, c.Client)
-			if err != nil {
+			if err := resources.CreateResource(ctx, headlessService, c.Client); err != nil {
 				return err
 			}
 		}
diff --git a/presets/models/falcon/model.go b/presets/models/falcon/model.go
index 392fa4628..3dc031f60 100644
--- a/presets/models/falcon/model.go
+++ b/presets/models/falcon/model.go
@@ -75,7 +75,7 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam {
 		ModelFamilyName:           "Falcon",
 		ImageAccessMode:           string(kaitov1alpha1.ModelImageAccessModePublic),
 		DiskStorageRequirement:    "50Gi",
-		GPUCountRequirement:       "2",
+		GPUCountRequirement:       "1",
 		TotalGPUMemoryRequirement: "16Gi",
 		PerGPUMemoryRequirement:   "16Gi",
 		TorchRunParams:            tuning.DefaultAccelerateParams,
diff --git a/test/e2e/preset_test.go b/test/e2e/preset_test.go
index e70e9eb55..ae2a30b7c 100644
--- a/test/e2e/preset_test.go
+++ b/test/e2e/preset_test.go
@@ -5,11 +5,19 @@ package e2e
 
 import (
 	"fmt"
+	"log"
 	"math/rand"
 	"os"
+	"path/filepath"
 	"strconv"
+	"strings"
 	"time"
 
+	batchv1 "k8s.io/api/batch/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/clientcmd"
+
 	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
 	kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1"
 	"github.com/azure/kaito/test/e2e/utils"
@@ -163,6 +171,76 @@ func createCustomWorkspaceWithPresetCustomMode(imageName string, numOfNode int)
 	return workspaceObj
 }
 
+// createPhi3WorkspaceWithPresetPublicMode builds an inference workspace for the
+// Phi-3-mini-128k-instruct preset in public access mode and submits it through
+// createAndValidateWorkspace.
+func createPhi3WorkspaceWithPresetPublicMode(numOfNode int) *kaitov1alpha1.Workspace {
+	workspaceObj := &kaitov1alpha1.Workspace{}
+	By("Creating a workspace CR with Phi-3-mini-128k-instruct preset public mode", func() {
+		uniqueID := fmt.Sprint("preset-", rand.Intn(1000))
+		workspaceObj = utils.GenerateInferenceWorkspaceManifest(uniqueID, namespaceName, "",
+			numOfNode, "Standard_NC6s_v3", &metav1.LabelSelector{
+				MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-phi-3-mini-128k-instruct"},
+			}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil)
+
+		createAndValidateWorkspace(workspaceObj)
+	})
+	return workspaceObj
+}
+
+// createCustomTuningConfigMapForE2E generates the E2E tuning ConfigMap for the
+// test namespace and creates it in the cluster.
+func createCustomTuningConfigMapForE2E() *v1.ConfigMap {
+	configMap := utils.GenerateE2ETuningConfigMapManifest(namespaceName)
+
+	By("Creating a custom tuning ConfigMap for E2E", func() {
+		createAndValidateConfigMap(configMap)
+	})
+
+	return configMap
+}
+
+// createAndValidateConfigMap creates configMap, retrying until the create call
+// succeeds, and then reads it back to confirm it exists.
+func createAndValidateConfigMap(configMap *v1.ConfigMap) {
+	By("Creating ConfigMap", func() {
+		Eventually(func() error {
+			return TestingCluster.KubeClient.Create(ctx, configMap, &client.CreateOptions{})
+		}, utils.PollTimeout, utils.PollInterval).
+			Should(Succeed(), "Failed to create ConfigMap %s", configMap.Name)
+
+		By("Validating ConfigMap creation", func() {
+			err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
+				Namespace: configMap.Namespace,
+				Name:      configMap.Name,
+			}, configMap, &client.GetOptions{})
+			Expect(err).NotTo(HaveOccurred())
+		})
+	})
+}
+
+// createPhi3TuningWorkspaceWithPresetPublicMode builds a QLoRA tuning workspace
+// for the Phi-3-mini-128k-instruct preset in public access mode and submits it
+// through createAndValidateWorkspace. It returns the workspace together with
+// the generated workspace name, which callers use as the tuning job name.
+func createPhi3TuningWorkspaceWithPresetPublicMode(configMapName string, numOfNode int) (*kaitov1alpha1.Workspace, string) {
+	workspaceObj := &kaitov1alpha1.Workspace{}
+	e2eOutputImageName := fmt.Sprintf("adapter-%s-e2e-test", PresetPhi3Mini128kModel)
+	e2eOutputImageTag := utils.GenerateRandomString()
+	var uniqueID string
+	By("Creating a workspace Tuning CR with Phi-3 preset public mode", func() {
+		uniqueID = fmt.Sprint("preset-", rand.Intn(1000))
+		outputRegistryURL := fmt.Sprintf("%s.azurecr.io/%s:%s", azureClusterName, e2eOutputImageName, e2eOutputImageTag)
+		workspaceObj = utils.GenerateE2ETuningWorkspaceManifest(uniqueID, namespaceName, "",
+			outputRegistryURL, numOfNode, "Standard_NC6s_v3", &metav1.LabelSelector{
+				MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-tuning-falcon"},
+			}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, []string{aiModelsRegistrySecret}, configMapName)
+
+		createAndValidateWorkspace(workspaceObj)
+	})
+	return workspaceObj, uniqueID
+}
+
 func createAndValidateWorkspace(workspaceObj *kaitov1alpha1.Workspace) {
 	By("Creating workspace", func() {
 		Eventually(func() error {
@@ -180,6 +258,33 @@ func createAndValidateWorkspace(workspaceObj *kaitov1alpha1.Workspace) {
 	})
 }
 
+// copySecretToNamespace copies the secret named secretName from the "default"
+// namespace into targetNamespace.
+func copySecretToNamespace(secretName, targetNamespace string) error {
+	originalNamespace := "default"
+	originalSecret := &v1.Secret{}
+
+	// Fetch the original secret from the default namespace
+	err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
+		Namespace: originalNamespace,
+		Name:      secretName,
+	}, originalSecret)
+	if err != nil {
+		return fmt.Errorf("failed to get secret %s in namespace %s: %w", secretName, originalNamespace, err)
+	}
+
+	// Create a copy of the secret for the target namespace
+	newSecret := utils.CopySecret(originalSecret, targetNamespace)
+
+	// Create the new secret in the target namespace
+	err = TestingCluster.KubeClient.Create(ctx, newSecret)
+	if err != nil {
+		return fmt.Errorf("failed to create secret %s in namespace %s: %w", secretName, targetNamespace, err)
+	}
+
+	return nil
+}
+
 func getAllValidMachines(workspaceObj *kaitov1alpha1.Workspace) (*v1alpha5.MachineList, error) {
 	machineList := &v1alpha5.MachineList{}
 	ls := labels.Set{
@@ -320,6 +425,97 @@ func validateInferenceResource(workspaceObj *kaitov1alpha1.Workspace, expectedRe
 	})
 }
 
+// validateTuningResource waits up to 30 minutes for the tuning Job named after
+// the workspace to report at least one succeeded pod.
+func validateTuningResource(workspaceObj *kaitov1alpha1.Workspace) {
+	By("Checking the tuning resource", func() {
+		Eventually(func() bool {
+			var err error
+			var jobFailed, jobSucceeded int32
+
+			job := &batchv1.Job{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      workspaceObj.Name,
+					Namespace: workspaceObj.Namespace,
+				},
+			}
+			err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
+				Namespace: workspaceObj.Namespace,
+				Name:      workspaceObj.Name,
+			}, job)
+
+			if err != nil {
+				GinkgoWriter.Printf("Error fetching resource: %v\n", err)
+				return false
+			}
+
+			jobFailed = job.Status.Failed
+			jobSucceeded = job.Status.Succeeded
+
+			if jobFailed > 0 {
+				GinkgoWriter.Printf("Job '%s' is in a failed state.\n", workspaceObj.Name)
+				return false
+			}
+
+			if jobSucceeded > 0 {
+				return true
+			}
+
+			return false
+		}, 30*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for Tuning resource to be ready")
+	})
+}
+
+// validateACRTuningResultsUploaded polls the docker-sidecar container logs of
+// the pod that belongs to jobName until they contain "Upload complete". Any
+// setup error, or 30 minutes without that message, fails the running spec.
+func validateACRTuningResultsUploaded(workspaceObj *kaitov1alpha1.Workspace, jobName string) {
+	var config *rest.Config
+	var err error
+
+	if os.Getenv("KUBERNETES_SERVICE_HOST") != "" && os.Getenv("KUBERNETES_SERVICE_PORT") != "" {
+		config, err = rest.InClusterConfig()
+		if err != nil {
+			Fail(fmt.Sprintf("Failed to get in-cluster config: %v", err))
+		}
+	} else {
+		// Use kubeconfig file for local development
+		kubeconfig := filepath.Join(os.Getenv("HOME"), ".kube", "config")
+		config, err = clientcmd.BuildConfigFromFlags("", kubeconfig)
+		if err != nil {
+			Fail(fmt.Sprintf("Failed to load kubeconfig: %v", err))
+		}
+	}
+
+	coreClient, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		Fail(fmt.Sprintf("Failed to create core client: %v", err))
+	}
+	namespace := workspaceObj.Namespace
+	podName, err := utils.GetPodNameForJob(coreClient, namespace, jobName)
+	if err != nil {
+		Fail(fmt.Sprintf("Failed to get pod name for job %s: %v", jobName, err))
+	}
+
+	// Bound the wait so a missing upload fails the spec instead of hanging the suite.
+	for deadline := time.Now().Add(30 * time.Minute); ; time.Sleep(10 * time.Second) {
+		if time.Now().After(deadline) {
+			Fail(fmt.Sprintf("Timed out waiting for upload to complete in pod %s", podName))
+		}
+
+		logs, err := utils.GetPodLogs(coreClient, namespace, podName, "docker-sidecar")
+		if err != nil {
+			log.Printf("Failed to get logs from pod %s: %v", podName, err)
+			continue
+		}
+
+		if strings.Contains(logs, "Upload complete") {
+			fmt.Println("Upload complete")
+			break
+		}
+	}
+}
+
 // Logic to validate workspace readiness
 func validateWorkspaceReadiness(workspaceObj *kaitov1alpha1.Workspace) {
 	By("Checking the workspace status is ready", func() {
@@ -383,6 +579,7 @@ var aiModelsRegistry string
 var aiModelsRegistrySecret string
 var supportedModelsYamlPath string
 var modelInfo map[string]string
+var azureClusterName string
 
 var _ = Describe("Workspace Preset", func() {
 	BeforeEach(func() {
@@ -515,4 +712,48 @@ var _ = Describe("Workspace Preset", func() {
 		validateWorkspaceReadiness(workspaceObj)
 	})
 
+	It("should create a Phi-3-mini-128k-instruct workspace with preset public mode successfully", func() {
+		numOfNode := 1
+		workspaceObj := createPhi3WorkspaceWithPresetPublicMode(numOfNode)
+
+		defer cleanupResources(workspaceObj)
+		time.Sleep(30 * time.Second)
+
+		validateMachineCreation(workspaceObj, numOfNode)
+		validateResourceStatus(workspaceObj)
+
+		time.Sleep(30 * time.Second)
+
+		validateAssociatedService(workspaceObj)
+
+		validateInferenceResource(workspaceObj, int32(numOfNode), false)
+
+		validateWorkspaceReadiness(workspaceObj)
+	})
+
+	It("should create a workspace for tuning successfully", func() {
+		numOfNode := 1
+		err := copySecretToNamespace(aiModelsRegistrySecret, namespaceName)
+		if err != nil {
+			Fail(fmt.Sprintf("Error copying secret: %v", err))
+		}
+		configMap := createCustomTuningConfigMapForE2E()
+		workspaceObj, jobName := createPhi3TuningWorkspaceWithPresetPublicMode(configMap.Name, numOfNode)
+
+		defer cleanupResources(workspaceObj)
+		time.Sleep(30 * time.Second)
+
+		validateMachineCreation(workspaceObj, numOfNode)
+		validateResourceStatus(workspaceObj)
+
+		time.Sleep(30 * time.Second)
+
+		// TODO: Need to check if tuning job uploaded to ACR
+		validateTuningResource(workspaceObj)
+
+		validateACRTuningResultsUploaded(workspaceObj, jobName)
+
+		validateWorkspaceReadiness(workspaceObj)
+	})
+
 })
diff --git a/test/e2e/utils/utils.go b/test/e2e/utils/utils.go
index 91d45bf47..c67a58063 100644
--- a/test/e2e/utils/utils.go
+++ b/test/e2e/utils/utils.go
@@ -24,6 +24,7 @@ import (
 
 const (
 	InferenceModeCustomTemplate kaitov1alpha1.ModelImageAccessMode = "customTemplate"
+	ExampleDatasetURL                                              = "https://huggingface.co/datasets/philschmid/dolly-15k-oai-style/resolve/main/data/train-00000-of-00001-54e3756291ca09c6.parquet?download=true"
 )
 
 var (
@@ -210,6 +211,109 @@ func GenerateTuningWorkspaceManifest(name, namespace, imageName string, resource
 	return workspace
 }
 
+// GenerateE2ETuningWorkspaceManifest generates a QLoRA tuning Workspace manifest
+// for E2E tests. The preset is only populated for the public and private access
+// modes, the input is ExampleDatasetURL, and the first entry of imagePullSecret,
+// when present, doubles as the output image push secret.
+func GenerateE2ETuningWorkspaceManifest(name, namespace, imageName, outputRegistry string,
+	resourceCount int, instanceType string, labelSelector *metav1.LabelSelector,
+	preferredNodes []string, presetName kaitov1alpha1.ModelName, accessMode kaitov1alpha1.ModelImageAccessMode,
+	imagePullSecret []string, customConfigMapName string) *kaitov1alpha1.Workspace {
+	workspace := &kaitov1alpha1.Workspace{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: namespace,
+		},
+		Resource: kaitov1alpha1.ResourceSpec{
+			Count:          lo.ToPtr(resourceCount),
+			InstanceType:   instanceType,
+			LabelSelector:  labelSelector,
+			PreferredNodes: preferredNodes,
+		},
+	}
+
+	var workspaceTuning kaitov1alpha1.TuningSpec
+	if accessMode == kaitov1alpha1.ModelImageAccessModePublic ||
+		accessMode == kaitov1alpha1.ModelImageAccessModePrivate {
+		workspaceTuning.Preset = &kaitov1alpha1.PresetSpec{
+			PresetMeta: kaitov1alpha1.PresetMeta{
+				Name:       presetName,
+				AccessMode: accessMode,
+			},
+			PresetOptions: kaitov1alpha1.PresetOptions{
+				Image:            imageName,
+				ImagePullSecrets: imagePullSecret,
+			},
+		}
+	}
+
+	workspace.Tuning = &workspaceTuning
+	workspace.Tuning.Method = kaitov1alpha1.TuningMethodQLora
+	workspace.Tuning.Input = &kaitov1alpha1.DataSource{
+		URLs: []string{ExampleDatasetURL},
+	}
+	workspace.Tuning.Output = &kaitov1alpha1.DataDestination{
+		Image: outputRegistry,
+	}
+	// Guard the index so an empty secret list cannot panic the test helper.
+	if len(imagePullSecret) > 0 {
+		workspace.Tuning.Output.ImagePushSecret = imagePullSecret[0]
+	}
+
+	if customConfigMapName != "" {
+		workspace.Tuning.Config = customConfigMapName
+	}
+
+	return workspace
+}
+
+// GenerateE2ETuningConfigMapManifest generates a ConfigMap manifest for E2E tuning.
+func GenerateE2ETuningConfigMapManifest(namespace string) *corev1.ConfigMap {
+	return &corev1.ConfigMap{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: "v1",
+			Kind:       "ConfigMap",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "e2e-qlora-params-template",
+			Namespace: namespace, // Same as workspace namespace
+		},
+		Data: map[string]string{
+			"training_config.yaml": `training_config:
+  ModelConfig:
+    torch_dtype: "bfloat16"
+    local_files_only: true
+    device_map: "auto"
+
+  QuantizationConfig:
+    load_in_4bit: true
+    bnb_4bit_quant_type: "nf4"
+    bnb_4bit_compute_dtype: "bfloat16"
+    bnb_4bit_use_double_quant: true
+
+  LoraConfig:
+    r: 8
+    lora_alpha: 8
+    lora_dropout: 0.0
+    target_modules: ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
+
+  TrainingArguments:
+    output_dir: "/mnt/results"
+    ddp_find_unused_parameters: false
+    save_strategy: "epoch"
+    per_device_train_batch_size: 1
+    max_steps: 2  # Adding this line to limit training to 2 steps
+
+  DataCollator:
+    mlm: true
+
+  DatasetConfig:
+    shuffle_dataset: true
+    train_test_split: 1`,
+		},
+	}
+}
+
 func GeneratePodTemplate(name, namespace, image string, labels map[string]string) *corev1.PodTemplateSpec {
 	return &corev1.PodTemplateSpec{
 		ObjectMeta: metav1.ObjectMeta{