From 17360709a598079ddf180221190e379c2bac7245 Mon Sep 17 00:00:00 2001 From: Akshay Chitneni Date: Mon, 28 Oct 2024 10:18:21 -0700 Subject: [PATCH] Adding cel validation on trainingRuntime CRD Signed-off-by: Akshay Chitneni --- api.v2/openapi-spec/swagger.json | 4 +- .../kubeflow.org_clustertrainingruntimes.yaml | 18 ++- .../crds/kubeflow.org_trainingruntimes.yaml | 18 ++- .../v2/base/crds/kubeflow.org_trainjobs.yaml | 5 +- .../v2alpha1/trainingruntime_types.go | 10 +- .../kubeflow.org/v2alpha1/trainjob_types.go | 3 +- .../v2alpha1/zz_generated.deepcopy.go | 5 +- .../v2alpha1/zz_generated.openapi.go | 10 +- .../v2alpha1/torchmlpolicysource.go | 8 +- .../kubeflow.org/v2alpha1/trainer.go | 5 +- pkg/runtime.v2/core/trainingruntime_test.go | 7 +- .../framework/plugins/torch/torch.go | 10 +- pkg/util.v2/testing/wrapper.go | 5 +- .../KubeflowOrgV2alpha1TorchMLPolicySource.md | 2 +- sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md | 2 +- ...low_org_v2alpha1_torch_ml_policy_source.py | 8 +- .../models/kubeflow_org_v2alpha1_trainer.py | 8 +- .../webhook.v2/clustertrainingruntime_test.go | 7 +- .../webhook.v2/trainingruntime_test.go | 126 ++++++++++++++++++ 19 files changed, 220 insertions(+), 41 deletions(-) diff --git a/api.v2/openapi-spec/swagger.json b/api.v2/openapi-spec/swagger.json index 077820f620..e82feee139 100644 --- a/api.v2/openapi-spec/swagger.json +++ b/api.v2/openapi-spec/swagger.json @@ -517,7 +517,7 @@ }, "numProcPerNode": { "description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.", - "type": "string" + "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString" } } }, @@ -716,7 +716,7 @@ }, "numProcPerNode": { "description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.", - "type": "string" + "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString" }, "resourcesPerNode": { "description": "Compute resources for each training node.", diff --git a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml index 4d281801c1..125aa757cb 100644 --- a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml @@ -50,6 +50,7 @@ spec: description: Configuration for the MPI Runtime. properties: mpiImplementation: + default: OpenMPI description: |- Implementation name for the MPI to create the appropriate hostfile. Defaults to OpenMPI. @@ -61,6 +62,7 @@ spec: format: int32 type: integer runLauncherAsNode: + default: false description: |- Whether to run training process on the launcher Job. Defaults to false. @@ -583,14 +585,27 @@ spec: type: integer type: object numProcPerNode: + anyOf: + - type: integer + - type: string + default: auto description: |- Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. 
- type: string + x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: NumProcPerNode must be equal to auto, cpu, gpu, + or int value + rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int type: object type: object + x-kubernetes-validations: + - message: numNodes should not be set if torch.elasticPolicy is configured + rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))' + - message: Only one of the policy can be configured + rule: '!(has(self.torch) && has(self.mpi))' podGroupPolicy: description: Configuration for the PodGroup to enable gang-scheduling via supported plugins. @@ -600,6 +615,7 @@ spec: for gang-scheduling. properties: scheduleTimeoutSeconds: + default: 60 description: |- Time threshold to schedule PodGroup for gang-scheduling. If the scheduling timeout is equal to 0, the default value is used. diff --git a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml index 0ae165315c..1cd09b1bdc 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml @@ -50,6 +50,7 @@ spec: description: Configuration for the MPI Runtime. properties: mpiImplementation: + default: OpenMPI description: |- Implementation name for the MPI to create the appropriate hostfile. Defaults to OpenMPI. @@ -61,6 +62,7 @@ spec: format: int32 type: integer runLauncherAsNode: + default: false description: |- Whether to run training process on the launcher Job. Defaults to false. @@ -583,14 +585,27 @@ spec: type: integer type: object numProcPerNode: + anyOf: + - type: integer + - type: string + default: auto description: |- Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. - type: string + x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: NumProcPerNode must be equal to auto, cpu, gpu, + or int value + rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int type: object type: object + x-kubernetes-validations: + - message: numNodes should not be set if torch.elasticPolicy is configured + rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))' + - message: Only one of the policy can be configured + rule: '!(has(self.torch) && has(self.mpi))' podGroupPolicy: description: Configuration for the PodGroup to enable gang-scheduling via supported plugins. @@ -600,6 +615,7 @@ spec: for gang-scheduling. properties: scheduleTimeoutSeconds: + default: 60 description: |- Time threshold to schedule PodGroup for gang-scheduling. If the scheduling timeout is equal to 0, the default value is used. diff --git a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml index 037e04191d..c22cc27896 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml @@ -3138,11 +3138,14 @@ spec: format: int32 type: integer numProcPerNode: + anyOf: + - type: integer + - type: string description: |- Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. - type: string + x-kubernetes-int-or-string: true resourcesPerNode: description: Compute resources for each training node. 
properties: diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index d2ddceb140..b8d4a067b3 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -19,6 +19,7 @@ package v2alpha1 import ( autoscalingv2 "k8s.io/api/autoscaling/v2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) @@ -142,10 +143,13 @@ type CoschedulingPodGroupPolicySource struct { // Time threshold to schedule PodGroup for gang-scheduling. // If the scheduling timeout is equal to 0, the default value is used. // Defaults to 60 seconds. + // +kubebuilder:default=60 ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` } // MLPolicy represents configuration for the model trining with ML-specific parameters. +// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured" +// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured" type MLPolicy struct { // Number of training nodes. // Defaults to 1. @@ -173,7 +177,9 @@ type TorchMLPolicySource struct { // Supported values: `auto`, `cpu`, `gpu`, or int value. // TODO (andreyvelich): Add kubebuilder validation. // Defaults to `auto`. - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + // +kubebuilder:default="auto" + // +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value" + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` // Elastic policy for the PyTorch training. ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"` @@ -210,6 +216,7 @@ type MPIMLPolicySource struct { // Implementation name for the MPI to create the appropriate hostfile. // Defaults to OpenMPI. + // +kubebuilder:default="OpenMPI" MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"` // Directory where SSH keys are mounted. @@ -217,6 +224,7 @@ type MPIMLPolicySource struct { // Whether to run training process on the launcher Job. // Defaults to false. + // +kubebuilder:default=false RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"` } diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index 04f995c1fa..c1e2971d21 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -19,6 +19,7 @@ package v2alpha1 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" ) const ( @@ -194,7 +195,7 @@ type Trainer struct { // Number of processes/workers/slots on every training node. // For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. // For the MPI runtime only int value can be set. - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` } // DatasetConfig represents the desired dataset configuration. 
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index cb4b3a5eeb..9c408b132e 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -24,6 +24,7 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" + intstr "k8s.io/apimachinery/pkg/util/intstr" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. @@ -576,7 +577,7 @@ func (in *TorchMLPolicySource) DeepCopyInto(out *TorchMLPolicySource) { *out = *in if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode - *out = new(string) + *out = new(intstr.IntOrString) **out = **in } if in.ElasticPolicy != nil { @@ -786,7 +787,7 @@ func (in *Trainer) DeepCopyInto(out *Trainer) { } if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode - *out = new(string) + *out = new(intstr.IntOrString) **out = **in } return diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go index 5248d0ef02..da42681fad 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go @@ -974,8 +974,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TorchMLPolicySource(ref common.Referen "numProcPerNode": { SchemaProps: spec.SchemaProps{ Description: "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.", - Type: []string{"string"}, - Format: "", + Ref: ref("k8s.io/apimachinery/pkg/util/intstr.IntOrString"), }, }, "elasticPolicy": { @@ -988,7 +987,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TorchMLPolicySource(ref common.Referen }, }, Dependencies: []string{ - "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchElasticPolicy"}, + "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchElasticPolicy", "k8s.io/apimachinery/pkg/util/intstr.IntOrString"}, } } @@ -1352,15 +1351,14 @@ func schema_pkg_apis_kubefloworg_v2alpha1_Trainer(ref common.ReferenceCallback) "numProcPerNode": { SchemaProps: spec.SchemaProps{ Description: "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.", - Type: []string{"string"}, - Format: "", + Ref: ref("k8s.io/apimachinery/pkg/util/intstr.IntOrString"), }, }, }, }, }, Dependencies: []string{ - "k8s.io/api/core/v1.EnvVar", "k8s.io/api/core/v1.ResourceRequirements"}, + "k8s.io/api/core/v1.EnvVar", "k8s.io/api/core/v1.ResourceRequirements", "k8s.io/apimachinery/pkg/util/intstr.IntOrString"}, } } diff --git a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go index 401234ac01..8a7c4883a2 100644 --- a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go +++ b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go @@ -16,10 +16,14 @@ package v2alpha1 +import ( + intstr "k8s.io/apimachinery/pkg/util/intstr" +) + // TorchMLPolicySourceApplyConfiguration represents a declarative configuration of the TorchMLPolicySource type for use // with apply. 
type TorchMLPolicySourceApplyConfiguration struct { - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` ElasticPolicy *TorchElasticPolicyApplyConfiguration `json:"elasticPolicy,omitempty"` } @@ -32,7 +36,7 @@ func TorchMLPolicySource() *TorchMLPolicySourceApplyConfiguration { // WithNumProcPerNode sets the NumProcPerNode field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the NumProcPerNode field is set to the value of the last call. -func (b *TorchMLPolicySourceApplyConfiguration) WithNumProcPerNode(value string) *TorchMLPolicySourceApplyConfiguration { +func (b *TorchMLPolicySourceApplyConfiguration) WithNumProcPerNode(value intstr.IntOrString) *TorchMLPolicySourceApplyConfiguration { b.NumProcPerNode = &value return b } diff --git a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go index 49d991a440..f8d19c1275 100644 --- a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go +++ b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go @@ -18,6 +18,7 @@ package v2alpha1 import ( v1 "k8s.io/api/core/v1" + intstr "k8s.io/apimachinery/pkg/util/intstr" ) // TrainerApplyConfiguration represents a declarative configuration of the Trainer type for use @@ -29,7 +30,7 @@ type TrainerApplyConfiguration struct { Env []v1.EnvVar `json:"env,omitempty"` NumNodes *int32 `json:"numNodes,omitempty"` ResourcesPerNode *v1.ResourceRequirements `json:"resourcesPerNode,omitempty"` - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` } // TrainerApplyConfiguration constructs a declarative configuration of the Trainer type for use with @@ -95,7 +96,7 @@ func (b *TrainerApplyConfiguration) WithResourcesPerNode(value v1.ResourceRequir // WithNumProcPerNode sets the NumProcPerNode field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the NumProcPerNode field is set to the value of the last call. -func (b *TrainerApplyConfiguration) WithNumProcPerNode(value string) *TrainerApplyConfiguration { +func (b *TrainerApplyConfiguration) WithNumProcPerNode(value intstr.IntOrString) *TrainerApplyConfiguration { b.NumProcPerNode = &value return b } diff --git a/pkg/runtime.v2/core/trainingruntime_test.go b/pkg/runtime.v2/core/trainingruntime_test.go index 2a11716cd3..cbc54efa06 100644 --- a/pkg/runtime.v2/core/trainingruntime_test.go +++ b/pkg/runtime.v2/core/trainingruntime_test.go @@ -19,6 +19,7 @@ package core import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/intstr" "testing" "github.com/google/go-cmp/cmp" @@ -263,7 +264,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { "succeeded to build JobSet with Torch values from the TrainJob": { trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). - TorchPolicy(100, "auto"). + TorchPolicy(100, intstr.FromString("auto")). ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). 
Obj(), ).Obj(), @@ -273,7 +274,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { Trainer( testingutil.MakeTrainJobTrainerWrapper(). NumNodes(30). - NumProcPerNode("3"). + NumProcPerNode(intstr.FromInt32(3)). Obj(), ). Obj(), @@ -317,7 +318,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { "succeeded to build JobSet with Torch values from the Runtime and envs.": { trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). - TorchPolicy(100, "auto"). + TorchPolicy(100, intstr.FromString("auto")). ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). ContainerTrainerEnv( []corev1.EnvVar{ diff --git a/pkg/runtime.v2/framework/plugins/torch/torch.go b/pkg/runtime.v2/framework/plugins/torch/torch.go index 4e9c40585f..2b69544d6f 100644 --- a/pkg/runtime.v2/framework/plugins/torch/torch.go +++ b/pkg/runtime.v2/framework/plugins/torch/torch.go @@ -19,6 +19,8 @@ package torch import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/intstr" + "strconv" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -71,6 +73,12 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *kubeflowv2.TrainJo // TODO (andreyvelich): Add validation to check that TrainJob doesn't have "PET_" envs. // TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other. // Ref: https://github.com/kubeflow/training-operator/pull/2308#discussion_r1823229940 + var numProcPerNodeVal string + if numProcPerNode.Type == intstr.Int { + numProcPerNodeVal = strconv.Itoa(int(numProcPerNode.IntVal)) + } else { + numProcPerNodeVal = numProcPerNode.StrVal + } infoEnvs := []corev1.EnvVar{ { Name: constants.TorchEnvNumNodes, @@ -78,7 +86,7 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *kubeflowv2.TrainJo }, { Name: constants.TorchEnvNumProcPerNode, - Value: ptr.Deref(numProcPerNode, "auto"), + Value: numProcPerNodeVal, }, { Name: constants.TorchEnvNodeRank, diff --git a/pkg/util.v2/testing/wrapper.go b/pkg/util.v2/testing/wrapper.go index fdbb3dd6c7..ee8547c5d9 100644 --- a/pkg/util.v2/testing/wrapper.go +++ b/pkg/util.v2/testing/wrapper.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" @@ -392,7 +393,7 @@ func (t *TrainJobTrainerWrapper) NumNodes(numNodes int32) *TrainJobTrainerWrappe return t } -func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode string) *TrainJobTrainerWrapper { +func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode intstr.IntOrString) *TrainJobTrainerWrapper { t.Trainer.NumProcPerNode = &numProcPerNode return t } @@ -689,7 +690,7 @@ func (s *TrainingRuntimeSpecWrapper) NumNodes(numNodes int32) *TrainingRuntimeSp return s } -func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode string) *TrainingRuntimeSpecWrapper { +func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode intstr.IntOrString) *TrainingRuntimeSpecWrapper { s.MLPolicy = &kubeflowv2.MLPolicy{ NumNodes: &numNodes, MLPolicySource: kubeflowv2.MLPolicySource{ diff --git 
a/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md b/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md index 72c2a70129..95b14f6b53 100644 --- a/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md +++ b/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md @@ -5,7 +5,7 @@ TorchMLPolicySource represents a PyTorch runtime configuration. Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- **elastic_policy** | [**KubeflowOrgV2alpha1TorchElasticPolicy**](KubeflowOrgV2alpha1TorchElasticPolicy.md) | | [optional] -**num_proc_per_node** | **str** | Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. | [optional] +**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) | | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md b/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md index f69b5ba330..f31f0f0660 100644 --- a/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md +++ b/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md @@ -9,7 +9,7 @@ Name | Type | Description | Notes **env** | [**list[V1EnvVar]**](V1EnvVar.md) | List of environment variables to set in the training container. These values will be merged with the TrainingRuntime's trainer environments. | [optional] **image** | **str** | Docker image for the training container. | [optional] **num_nodes** | **int** | Number of training nodes. | [optional] -**num_proc_per_node** | **str** | Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. | [optional] +**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) | | [optional] **resources_per_node** | [**V1ResourceRequirements**](V1ResourceRequirements.md) | | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py index 5cb0d5aa53..c61971ad01 100644 --- a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py +++ b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py @@ -34,7 +34,7 @@ class KubeflowOrgV2alpha1TorchMLPolicySource(object): """ openapi_types = { 'elastic_policy': 'KubeflowOrgV2alpha1TorchElasticPolicy', - 'num_proc_per_node': 'str' + 'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString' } attribute_map = { @@ -82,10 +82,9 @@ def elastic_policy(self, elastic_policy): def num_proc_per_node(self): """Gets the num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501 - Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. # noqa: E501 :return: The num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. 
# noqa: E501 - :rtype: str + :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString """ return self._num_proc_per_node @@ -93,10 +92,9 @@ def num_proc_per_node(self): def num_proc_per_node(self, num_proc_per_node): """Sets the num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. - Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. # noqa: E501 :param num_proc_per_node: The num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501 - :type: str + :type: K8sIoApimachineryPkgUtilIntstrIntOrString """ self._num_proc_per_node = num_proc_per_node diff --git a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py index e82d160b96..73c0b81b89 100644 --- a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py +++ b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py @@ -38,7 +38,7 @@ class KubeflowOrgV2alpha1Trainer(object): 'env': 'list[V1EnvVar]', 'image': 'str', 'num_nodes': 'int', - 'num_proc_per_node': 'str', + 'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString', 'resources_per_node': 'V1ResourceRequirements' } @@ -201,10 +201,9 @@ def num_nodes(self, num_nodes): def num_proc_per_node(self): """Gets the num_proc_per_node of this KubeflowOrgV2alpha1Trainer. # noqa: E501 - Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. # noqa: E501 :return: The num_proc_per_node of this KubeflowOrgV2alpha1Trainer. # noqa: E501 - :rtype: str + :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString """ return self._num_proc_per_node @@ -212,10 +211,9 @@ def num_proc_per_node(self): def num_proc_per_node(self, num_proc_per_node): """Sets the num_proc_per_node of this KubeflowOrgV2alpha1Trainer. - Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. # noqa: E501 :param num_proc_per_node: The num_proc_per_node of this KubeflowOrgV2alpha1Trainer. # noqa: E501 - :type: str + :type: K8sIoApimachineryPkgUtilIntstrIntOrString """ self._num_proc_per_node = num_proc_per_node diff --git a/test/integration/webhook.v2/clustertrainingruntime_test.go b/test/integration/webhook.v2/clustertrainingruntime_test.go index 831937ed3f..88defc0f0d 100644 --- a/test/integration/webhook.v2/clustertrainingruntime_test.go +++ b/test/integration/webhook.v2/clustertrainingruntime_test.go @@ -17,14 +17,13 @@ limitations under the License. 
package webhookv2 import ( + kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" + testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing" + "github.com/kubeflow/training-operator/test/integration/framework" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" - testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing" - "github.com/kubeflow/training-operator/test/integration/framework" ) const clTrainingRuntimeName = "test-clustertrainingruntime" diff --git a/test/integration/webhook.v2/trainingruntime_test.go b/test/integration/webhook.v2/trainingruntime_test.go index 0de7cd2250..f474ee6ca2 100644 --- a/test/integration/webhook.v2/trainingruntime_test.go +++ b/test/integration/webhook.v2/trainingruntime_test.go @@ -17,10 +17,13 @@ limitations under the License. package webhookv2 import ( + "github.com/kubeflow/training-operator/test/util" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" @@ -75,3 +78,126 @@ var _ = ginkgo.Describe("TrainingRuntime Webhook", ginkgo.Ordered, func() { ) }) }) + +var _ = ginkgo.Describe("TrainingRuntime marker validations and defaulting", ginkgo.Ordered, func() { + var ns *corev1.Namespace + + ginkgo.BeforeAll(func() { + fwk = &framework.Framework{} + cfg = fwk.Init() + ctx, k8sClient = fwk.RunManager(cfg) + }) + ginkgo.AfterAll(func() { + fwk.Teardown() + }) + + ginkgo.BeforeEach(func() { + ns = &corev1.Namespace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "Namespace", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "training-runtime-marker-", + }, + } + gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) + }) + ginkgo.AfterEach(func() { + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainingRuntime{}, client.InNamespace(ns.Name))).Should(gomega.Succeed()) + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.ClusterTrainingRuntime{})).Should(gomega.Succeed()) + }) + + ginkgo.When("Creating TrainingRuntime", func() { + ginkgo.DescribeTable("Validate TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, errorMatcher gomega.OmegaMatcher) { + gomega.Expect(k8sClient.Create(ctx, trainingRuntime())).Should(errorMatcher) + }, + ginkgo.Entry("Should succeed to create trainingRuntime", + func() *kubeflowv2.TrainingRuntime { + return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime"). 
+ Obj() + }, + gomega.Succeed()), + ginkgo.Entry("Should fail to create trainingRuntime with both MPI and Torch runtimes", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{}, + MPI: &kubeflowv2.MPIMLPolicySource{}, + }, + } + return runtime + }, + testingutil.BeInvalidError()), + ginkgo.Entry("Should fail to create trainingRuntime with minNodes and torch.elasticPolicy", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + NumNodes: ptr.To(int32(2)), + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{ + ElasticPolicy: &kubeflowv2.TorchElasticPolicy{}, + }, + }, + } + return runtime + }, + testingutil.BeInvalidError()), + ) + ginkgo.DescribeTable("Defaulting TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, wantTrainingRuntime func() *kubeflowv2.TrainingRuntime) { + created := trainingRuntime() + gomega.Expect(k8sClient.Create(ctx, created)).Should(gomega.Succeed()) + gomega.Expect(created).Should(gomega.BeComparableTo(wantTrainingRuntime(), util.IgnoreObjectMetadata)) + }, + ginkgo.Entry("Should succeed to default torch.NumProcPerNode=auto", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{}, + }, + } + return runtime + }, + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{ + NumProcPerNode: ptr.To(intstr.FromString("auto")), + }, + }, + } + runtime.Spec.Template.Spec = testingutil.MakeJobSetWrapper(ns.Name, "runtime"). + Replicas(1).Obj().Spec + return runtime + }), + + ginkgo.Entry("Should succeed to default mpi.mpiImplementation=OpenMPI", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + MPI: &kubeflowv2.MPIMLPolicySource{}, + }, + } + return runtime + }, + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + MPI: &kubeflowv2.MPIMLPolicySource{ + MPIImplementation: ptr.To(kubeflowv2.MPIImplementationOpenMPI), + RunLauncherAsNode: ptr.To(false), + }, + }, + } + runtime.Spec.Template.Spec = testingutil.MakeJobSetWrapper(ns.Name, "runtime"). + Replicas(1).Obj().Spec + return runtime + }), + ) + }) +})
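
For illustration, a minimal sketch of how the new CEL rules behave once this patch is applied (the `mlPolicy` field name and the elided JobSet template are assumptions based on the CRD hunks above, not shown verbatim in this diff):

# Rejected on admission: both torch and mpi are set, which trips the new
# "Only one of the policy can be configured" rule on MLPolicy.
apiVersion: kubeflow.org/v2alpha1
kind: TrainingRuntime
metadata:
  name: invalid-runtime
spec:
  mlPolicy:
    torch: {}
    mpi: {}
  template:
    spec: {}   # JobSet template elided for brevity
---
# Admitted: Torch only. numProcPerNode may be "auto", "cpu", "gpu", or an
# integer (here 8); if omitted it defaults to "auto". numNodes is allowed
# because torch.elasticPolicy is not configured.
apiVersion: kubeflow.org/v2alpha1
kind: TrainingRuntime
metadata:
  name: torch-runtime
spec:
  mlPolicy:
    numNodes: 2
    torch:
      numProcPerNode: 8
  template:
    spec: {}   # JobSet template elided for brevity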