feat: Add Min GPU Memory Requirement (#443)
**Reason for Change**:
1. Add interface field `TuningPerGPUMemoryRequirement` - minimum GPU memory per
tuning method (batch size 1).
2. Fix bugs in tuning ConfigMap creation: fall back to the default LoRA/QLoRA
config template when no custom template is specified, and drop the unused
PresetParam argument from EnsureTuningConfigMap.
ishaansehgal99 authored May 30, 2024
1 parent e24e8b3 commit 76a966b
Showing 4 changed files with 46 additions and 28 deletions.
27 changes: 13 additions & 14 deletions pkg/model/interface.go
@@ -15,22 +15,21 @@ type Model interface {

// PresetParam defines the preset inference parameters for a model.
type PresetParam struct {
ModelFamilyName string // The name of the model family.
ImageAccessMode string // Defines whether the Image is Public or Private.
DiskStorageRequirement string // Disk storage requirements for the model.
GPUCountRequirement string // Number of GPUs required for the Preset.
TotalGPUMemoryRequirement string // Total GPU memory required for the Preset.
PerGPUMemoryRequirement string // GPU memory required per GPU.
TorchRunParams map[string]string // Parameters for configuring the torchrun command.
TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic).
// BaseCommand is the initial command (e.g., 'torchrun', 'accelerate launch') used in the command line.
BaseCommand string
ModelRunParams map[string]string // Parameters for running the model training/inference.
ModelFamilyName string // The name of the model family.
ImageAccessMode string // Defines whether the Image is Public or Private.
DiskStorageRequirement string // Disk storage requirements for the model.
GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference.
TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference.
PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference.
TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning.
TorchRunParams map[string]string // Parameters for configuring the torchrun command.
TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic).
BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line.
ModelRunParams map[string]string // Parameters for running the model training/inference.
// ReadinessTimeout defines the maximum duration for creating the workload.
// This timeout accommodates the size of the image, ensuring pull completion
// even under slower network conditions or unforeseen delays.
ReadinessTimeout time.Duration
// WorldSize defines the number of processes required for distributed inference.
WorldSize int
Tag string // The model image tag
WorldSize int // Defines the number of processes required for distributed inference.
Tag string // The model image tag
}
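
For context, a minimal sketch of how the new TuningPerGPUMemoryRequirement map might be consumed when validating a tuning workspace. The helper, the perGPUMemGi parameter, and the import path are assumptions for illustration only; the only piece taken from this commit is the field itself.

package example

import (
	"fmt"

	"github.com/azure/kaito/pkg/model"
)

// checkTuningGPUMemory is a hypothetical helper: it verifies that each GPU on
// the chosen SKU has at least the minimum memory the preset declares for the
// requested tuning method at batch size 1.
func checkTuningGPUMemory(preset *model.PresetParam, method string, perGPUMemGi int) error {
	minGi, ok := preset.TuningPerGPUMemoryRequirement[method]
	if !ok {
		return fmt.Errorf("tuning method %q is not supported by this preset", method)
	}
	if perGPUMemGi < minGi {
		return fmt.Errorf("tuning method %q requires at least %dGi GPU memory per GPU, but the SKU provides %dGi",
			method, minGi, perGPUMemGi)
	}
	return nil
}

With the falcon-7b preset shown later in this diff, a call such as checkTuningGPUMemory(preset, "qlora", 16) would pass, while a SKU offering less than 16Gi per GPU would be rejected.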
38 changes: 28 additions & 10 deletions pkg/tuning/preset-tuning.go
@@ -85,11 +85,29 @@ func GetDataSrcImageInfo(ctx context.Context, wObj *kaitov1alpha1.Workspace) (st
return wObj.Tuning.Input.Image, imagePullSecretRefs
}

// EnsureTuningConfigMap handles two scenarios:
// 1. Custom config template specified:
// - Check if it exists in the target namespace.
// - If not, check the release namespace and copy it to the target namespace if found.
//
// 2. No custom config template specified:
// - Use the default config template based on the tuning method (e.g., LoRA or QLoRA).
// - Check if it exists in the target namespace.
// - If not, check the release namespace and copy it to the target namespace if found.
func EnsureTuningConfigMap(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace,
tuningObj *model.PresetParam, kubeClient client.Client) (*corev1.ConfigMap, error) {
// Copy Configmap from helm chart configmap into workspace
kubeClient client.Client) (*corev1.ConfigMap, error) {
tuningConfigMapName := workspaceObj.Tuning.ConfigTemplate
if tuningConfigMapName == "" {
if workspaceObj.Tuning.Method == kaitov1alpha1.TuningMethodLora {
tuningConfigMapName = kaitov1alpha1.DefaultLoraConfigMapTemplate
} else if workspaceObj.Tuning.Method == kaitov1alpha1.TuningMethodQLora {
tuningConfigMapName = kaitov1alpha1.DefaultQloraConfigMapTemplate
}
}

// Check if intended configmap already exists in target namespace
existingCM := &corev1.ConfigMap{}
err := resources.GetResource(ctx, workspaceObj.Tuning.ConfigTemplate, workspaceObj.Namespace, kubeClient, existingCM)
err := resources.GetResource(ctx, tuningConfigMapName, workspaceObj.Namespace, kubeClient, existingCM)
if err != nil {
if !errors.IsNotFound(err) {
return nil, err
@@ -104,7 +122,7 @@ func EnsureTuningConfigMap(ctx context.Context, workspaceObj *kaitov1alpha1.Work
return nil, fmt.Errorf("failed to get release namespace: %v", err)
}
templateCM := &corev1.ConfigMap{}
err = resources.GetResource(ctx, workspaceObj.Tuning.ConfigTemplate, releaseNamespace, kubeClient, templateCM)
err = resources.GetResource(ctx, tuningConfigMapName, releaseNamespace, kubeClient, templateCM)
if err != nil {
return nil, fmt.Errorf("failed to get ConfigMap from template namespace: %v", err)
}
@@ -222,7 +240,7 @@ func SetupTrainingOutputVolume(ctx context.Context, configMap *corev1.ConfigMap)
return resultsVolume, resultsVolumeMount, outputDir
}

func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.Volume, []corev1.VolumeMount) {
func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace, cmName string) ([]corev1.Volume, []corev1.VolumeMount) {
var volumes []corev1.Volume
var volumeMounts []corev1.VolumeMount

@@ -236,7 +254,7 @@ func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.
}

// Add shared volume for tuning parameters
cmVolume, cmVolumeMount := utils.ConfigCMVolume(workspaceObj.Tuning.ConfigTemplate)
cmVolume, cmVolumeMount := utils.ConfigCMVolume(cmName)
volumes = append(volumes, cmVolume)
volumeMounts = append(volumeMounts, cmVolumeMount)

@@ -245,14 +263,14 @@ func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.

func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace,
tuningObj *model.PresetParam, kubeClient client.Client) (client.Object, error) {
var initContainers, sidecarContainers []corev1.Container
volumes, volumeMounts := setupDefaultSharedVolumes(workspaceObj)

cm, err := EnsureTuningConfigMap(ctx, workspaceObj, tuningObj, kubeClient)
cm, err := EnsureTuningConfigMap(ctx, workspaceObj, kubeClient)
if err != nil {
return nil, err
}

var initContainers, sidecarContainers []corev1.Container
volumes, volumeMounts := setupDefaultSharedVolumes(workspaceObj, cm.Name)

// Add shared volume for training output
trainingOutputVolume, trainingOutputVolumeMount, outputDir := SetupTrainingOutputVolume(ctx, cm)
volumes = append(volumes, trainingOutputVolume)
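
The EnsureTuningConfigMap hunk above is cut off before the step its new comment describes: once the template ConfigMap is found in the release namespace, it is copied into the workspace namespace. Below is a minimal sketch of that copy step, assuming a controller-runtime client; the helper name and structure are illustrative, not the committed code.

package example

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// copyConfigMapToNamespace illustrates copying a template ConfigMap found in
// the release namespace into the target (workspace) namespace.
func copyConfigMapToNamespace(ctx context.Context, kubeClient client.Client,
	templateCM *corev1.ConfigMap, targetNamespace string) (*corev1.ConfigMap, error) {
	copiedCM := templateCM.DeepCopy()
	copiedCM.Namespace = targetNamespace
	// Server-assigned metadata must be cleared before creating the copy.
	copiedCM.ResourceVersion = ""
	copiedCM.UID = ""
	if err := kubeClient.Create(ctx, copiedCM); err != nil {
		return nil, fmt.Errorf("failed to create ConfigMap %q in namespace %q: %v",
			copiedCM.Name, targetNamespace, err)
	}
	return copiedCM, nil
}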
2 changes: 1 addition & 1 deletion pkg/tuning/preset-tuning_test.go
@@ -239,7 +239,7 @@ func TestEnsureTuningConfigMap(t *testing.T) {
mockClient := test.NewClient()
tc.callMocks(mockClient)
tc.workspaceObj.SetNamespace("workspace-namespace")
_, err := EnsureTuningConfigMap(context.Background(), tc.workspaceObj, nil, mockClient)
_, err := EnsureTuningConfigMap(context.Background(), tc.workspaceObj, mockClient)
if tc.expectedError != "" {
assert.EqualError(t, err, tc.expectedError)
} else {
7 changes: 4 additions & 3 deletions presets/models/falcon/model.go
@@ -80,9 +80,10 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam {
PerGPUMemoryRequirement: "16Gi",
TorchRunParams: tuning.DefaultAccelerateParams,
//ModelRunPrams: falconRunTuningParams, // TODO
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetFalcon,
Tag: PresetFalconTagMap["Falcon7B"],
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetFalcon,
Tag: PresetFalconTagMap["Falcon7B"],
TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16},
}
}

