feat: Add Min GPU Memory Requirement (#443)
**Reason for Change**:
1. Add interface field `TuningPerGPUMemoryRequirement` - minimum GPU memory per
tuning method (batch size 1).
2. Fix bugs in tuning ConfigMap creation: fall back to the default LoRA/QLoRA
config template when no custom template is specified, and drop the unused
PresetParam argument from EnsureTuningConfigMap.
ishaansehgal99 authored May 30, 2024
1 parent e24e8b3 commit 76a966b
Showing 4 changed files with 46 additions and 28 deletions.
27 changes: 13 additions & 14 deletions pkg/model/interface.go
@@ -15,22 +15,21 @@ type Model interface {

// PresetParam defines the preset inference parameters for a model.
type PresetParam struct {
ModelFamilyName string // The name of the model family.
ImageAccessMode string // Defines whether the Image is Public or Private.
DiskStorageRequirement string // Disk storage requirements for the model.
GPUCountRequirement string // Number of GPUs required for the Preset.
TotalGPUMemoryRequirement string // Total GPU memory required for the Preset.
PerGPUMemoryRequirement string // GPU memory required per GPU.
TorchRunParams map[string]string // Parameters for configuring the torchrun command.
TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic).
// BaseCommand is the initial command (e.g., 'torchrun', 'accelerate launch') used in the command line.
BaseCommand string
ModelRunParams map[string]string // Parameters for running the model training/inference.
ModelFamilyName string // The name of the model family.
ImageAccessMode string // Defines whether the Image is Public or Private.
DiskStorageRequirement string // Disk storage requirements for the model.
GPUCountRequirement string // Number of GPUs required for the Preset. Used for inference.
TotalGPUMemoryRequirement string // Total GPU memory required for the Preset. Used for inference.
PerGPUMemoryRequirement string // GPU memory required per GPU. Used for inference.
TuningPerGPUMemoryRequirement map[string]int // Min GPU memory per tuning method (batch size 1). Used for tuning.
TorchRunParams map[string]string // Parameters for configuring the torchrun command.
TorchRunRdzvParams map[string]string // Optional rendezvous parameters for distributed training/inference using torchrun (elastic).
BaseCommand string // The initial command (e.g., 'torchrun', 'accelerate launch') used in the command line.
ModelRunParams map[string]string // Parameters for running the model training/inference.
// ReadinessTimeout defines the maximum duration for creating the workload.
// This timeout accommodates the size of the image, ensuring pull completion
// even under slower network conditions or unforeseen delays.
ReadinessTimeout time.Duration
// WorldSize defines the number of processes required for distributed inference.
WorldSize int
Tag string // The model image tag
WorldSize int // Defines the number of processes required for distributed inference.
Tag string // The model image tag
}
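
For context, a minimal sketch of how the new TuningPerGPUMemoryRequirement map might be consumed when validating a tuning workspace. The helper, the perGPUMemGi parameter, and the import path are assumptions for illustration only; the only piece taken from this commit is the field itself.

package example

import (
	"fmt"

	"github.com/azure/kaito/pkg/model"
)

// checkTuningGPUMemory is a hypothetical helper: it verifies that each GPU on
// the chosen SKU has at least the minimum memory the preset declares for the
// requested tuning method at batch size 1.
func checkTuningGPUMemory(preset *model.PresetParam, method string, perGPUMemGi int) error {
	minGi, ok := preset.TuningPerGPUMemoryRequirement[method]
	if !ok {
		return fmt.Errorf("tuning method %q is not supported by this preset", method)
	}
	if perGPUMemGi < minGi {
		return fmt.Errorf("tuning method %q requires at least %dGi GPU memory per GPU, but the SKU provides %dGi",
			method, minGi, perGPUMemGi)
	}
	return nil
}

With the falcon-7b preset shown later in this diff, a call such as checkTuningGPUMemory(preset, "qlora", 16) would pass, while a SKU offering less than 16Gi per GPU would be rejected.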
38 changes: 28 additions & 10 deletions pkg/tuning/preset-tuning.go
@@ -85,11 +85,29 @@ func GetDataSrcImageInfo(ctx context.Context, wObj *kaitov1alpha1.Workspace) (st
return wObj.Tuning.Input.Image, imagePullSecretRefs
}

// EnsureTuningConfigMap handles two scenarios:
// 1. Custom config template specified:
// - Check if it exists in the target namespace.
// - If not, check the release namespace and copy it to the target namespace if found.
//
// 2. No custom config template specified:
// - Use the default config template based on the tuning method (e.g., LoRA or QLoRA).
// - Check if it exists in the target namespace.
// - If not, check the release namespace and copy it to the target namespace if found.
func EnsureTuningConfigMap(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace,
tuningObj *model.PresetParam, kubeClient client.Client) (*corev1.ConfigMap, error) {
// Copy Configmap from helm chart configmap into workspace
kubeClient client.Client) (*corev1.ConfigMap, error) {
tuningConfigMapName := workspaceObj.Tuning.ConfigTemplate
if tuningConfigMapName == "" {
if workspaceObj.Tuning.Method == kaitov1alpha1.TuningMethodLora {
tuningConfigMapName = kaitov1alpha1.DefaultLoraConfigMapTemplate
} else if workspaceObj.Tuning.Method == kaitov1alpha1.TuningMethodQLora {
tuningConfigMapName = kaitov1alpha1.DefaultQloraConfigMapTemplate
}
}

// Check if intended configmap already exists in target namespace
existingCM := &corev1.ConfigMap{}
err := resources.GetResource(ctx, workspaceObj.Tuning.ConfigTemplate, workspaceObj.Namespace, kubeClient, existingCM)
err := resources.GetResource(ctx, tuningConfigMapName, workspaceObj.Namespace, kubeClient, existingCM)
if err != nil {
if !errors.IsNotFound(err) {
return nil, err
@@ -104,7 +122,7 @@ func EnsureTuningConfigMap(ctx context.Context, workspaceObj *kaitov1alpha1.Work
return nil, fmt.Errorf("failed to get release namespace: %v", err)
}
templateCM := &corev1.ConfigMap{}
err = resources.GetResource(ctx, workspaceObj.Tuning.ConfigTemplate, releaseNamespace, kubeClient, templateCM)
err = resources.GetResource(ctx, tuningConfigMapName, releaseNamespace, kubeClient, templateCM)
if err != nil {
return nil, fmt.Errorf("failed to get ConfigMap from template namespace: %v", err)
}
@@ -222,7 +240,7 @@ func SetupTrainingOutputVolume(ctx context.Context, configMap *corev1.ConfigMap)
return resultsVolume, resultsVolumeMount, outputDir
}

func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.Volume, []corev1.VolumeMount) {
func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace, cmName string) ([]corev1.Volume, []corev1.VolumeMount) {
var volumes []corev1.Volume
var volumeMounts []corev1.VolumeMount

@@ -236,7 +254,7 @@ func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.
}

// Add shared volume for tuning parameters
cmVolume, cmVolumeMount := utils.ConfigCMVolume(workspaceObj.Tuning.ConfigTemplate)
cmVolume, cmVolumeMount := utils.ConfigCMVolume(cmName)
volumes = append(volumes, cmVolume)
volumeMounts = append(volumeMounts, cmVolumeMount)

@@ -245,14 +263,14 @@ func setupDefaultSharedVolumes(workspaceObj *kaitov1alpha1.Workspace) ([]corev1.

func CreatePresetTuning(ctx context.Context, workspaceObj *kaitov1alpha1.Workspace,
tuningObj *model.PresetParam, kubeClient client.Client) (client.Object, error) {
var initContainers, sidecarContainers []corev1.Container
volumes, volumeMounts := setupDefaultSharedVolumes(workspaceObj)

cm, err := EnsureTuningConfigMap(ctx, workspaceObj, tuningObj, kubeClient)
cm, err := EnsureTuningConfigMap(ctx, workspaceObj, kubeClient)
if err != nil {
return nil, err
}

var initContainers, sidecarContainers []corev1.Container
volumes, volumeMounts := setupDefaultSharedVolumes(workspaceObj, cm.Name)

// Add shared volume for training output
trainingOutputVolume, trainingOutputVolumeMount, outputDir := SetupTrainingOutputVolume(ctx, cm)
volumes = append(volumes, trainingOutputVolume)
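
The EnsureTuningConfigMap hunk above is cut off before the step its new comment describes: once the template ConfigMap is found in the release namespace, it is copied into the workspace namespace. Below is a minimal sketch of that copy step, assuming a controller-runtime client; the helper name and structure are illustrative, not the committed code.

package example

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// copyConfigMapToNamespace illustrates copying a template ConfigMap found in
// the release namespace into the target (workspace) namespace.
func copyConfigMapToNamespace(ctx context.Context, kubeClient client.Client,
	templateCM *corev1.ConfigMap, targetNamespace string) (*corev1.ConfigMap, error) {
	copiedCM := templateCM.DeepCopy()
	copiedCM.Namespace = targetNamespace
	// Server-assigned metadata must be cleared before creating the copy.
	copiedCM.ResourceVersion = ""
	copiedCM.UID = ""
	if err := kubeClient.Create(ctx, copiedCM); err != nil {
		return nil, fmt.Errorf("failed to create ConfigMap %q in namespace %q: %v",
			copiedCM.Name, targetNamespace, err)
	}
	return copiedCM, nil
}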
2 changes: 1 addition & 1 deletion pkg/tuning/preset-tuning_test.go
@@ -239,7 +239,7 @@ func TestEnsureTuningConfigMap(t *testing.T) {
mockClient := test.NewClient()
tc.callMocks(mockClient)
tc.workspaceObj.SetNamespace("workspace-namespace")
_, err := EnsureTuningConfigMap(context.Background(), tc.workspaceObj, nil, mockClient)
_, err := EnsureTuningConfigMap(context.Background(), tc.workspaceObj, mockClient)
if tc.expectedError != "" {
assert.EqualError(t, err, tc.expectedError)
} else {
7 changes: 4 additions & 3 deletions presets/models/falcon/model.go
@@ -80,9 +80,10 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam {
PerGPUMemoryRequirement: "16Gi",
TorchRunParams: tuning.DefaultAccelerateParams,
//ModelRunPrams: falconRunTuningParams, // TODO
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetFalcon,
Tag: PresetFalconTagMap["Falcon7B"],
ReadinessTimeout: time.Duration(30) * time.Minute,
BaseCommand: baseCommandPresetFalcon,
Tag: PresetFalconTagMap["Falcon7B"],
TuningPerGPUMemoryRequirement: map[string]int{"qlora": 16},
}
}

