From 17360709a598079ddf180221190e379c2bac7245 Mon Sep 17 00:00:00 2001 From: Akshay Chitneni Date: Mon, 28 Oct 2024 10:18:21 -0700 Subject: [PATCH] Adding cel validation on trainingRuntime CRD Signed-off-by: Akshay Chitneni --- api.v2/openapi-spec/swagger.json | 4 +- .../kubeflow.org_clustertrainingruntimes.yaml | 18 ++- .../crds/kubeflow.org_trainingruntimes.yaml | 18 ++- .../v2/base/crds/kubeflow.org_trainjobs.yaml | 5 +- .../v2alpha1/trainingruntime_types.go | 10 +- .../kubeflow.org/v2alpha1/trainjob_types.go | 3 +- .../v2alpha1/zz_generated.deepcopy.go | 5 +- .../v2alpha1/zz_generated.openapi.go | 10 +- .../v2alpha1/torchmlpolicysource.go | 8 +- .../kubeflow.org/v2alpha1/trainer.go | 5 +- pkg/runtime.v2/core/trainingruntime_test.go | 7 +- .../framework/plugins/torch/torch.go | 10 +- pkg/util.v2/testing/wrapper.go | 5 +- .../KubeflowOrgV2alpha1TorchMLPolicySource.md | 2 +- sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md | 2 +- ...low_org_v2alpha1_torch_ml_policy_source.py | 8 +- .../models/kubeflow_org_v2alpha1_trainer.py | 8 +- .../webhook.v2/clustertrainingruntime_test.go | 7 +- .../webhook.v2/trainingruntime_test.go | 126 ++++++++++++++++++ 19 files changed, 220 insertions(+), 41 deletions(-) diff --git a/api.v2/openapi-spec/swagger.json b/api.v2/openapi-spec/swagger.json index 077820f620..e82feee139 100644 --- a/api.v2/openapi-spec/swagger.json +++ b/api.v2/openapi-spec/swagger.json @@ -517,7 +517,7 @@ }, "numProcPerNode": { "description": "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.", - "type": "string" + "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString" } } }, @@ -716,7 +716,7 @@ }, "numProcPerNode": { "description": "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.", - "type": "string" + "$ref": "#/definitions/k8s.io.apimachinery.pkg.util.intstr.IntOrString" }, "resourcesPerNode": { "description": "Compute resources for each training node.", diff --git a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml index 4d281801c1..125aa757cb 100644 --- a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml @@ -50,6 +50,7 @@ spec: description: Configuration for the MPI Runtime. properties: mpiImplementation: + default: OpenMPI description: |- Implementation name for the MPI to create the appropriate hostfile. Defaults to OpenMPI. @@ -61,6 +62,7 @@ spec: format: int32 type: integer runLauncherAsNode: + default: false description: |- Whether to run training process on the launcher Job. Defaults to false. @@ -583,14 +585,27 @@ spec: type: integer type: object numProcPerNode: + anyOf: + - type: integer + - type: string + default: auto description: |- Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. 
- type: string + x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: NumProcPerNode must be equal to auto, cpu, gpu, + or int value + rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int type: object type: object + x-kubernetes-validations: + - message: numNodes should not be set if torch.elasticPolicy is configured + rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))' + - message: Only one of the policy can be configured + rule: '!(has(self.torch) && has(self.mpi))' podGroupPolicy: description: Configuration for the PodGroup to enable gang-scheduling via supported plugins. @@ -600,6 +615,7 @@ spec: for gang-scheduling. properties: scheduleTimeoutSeconds: + default: 60 description: |- Time threshold to schedule PodGroup for gang-scheduling. If the scheduling timeout is equal to 0, the default value is used. diff --git a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml index 0ae165315c..1cd09b1bdc 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml @@ -50,6 +50,7 @@ spec: description: Configuration for the MPI Runtime. properties: mpiImplementation: + default: OpenMPI description: |- Implementation name for the MPI to create the appropriate hostfile. Defaults to OpenMPI. @@ -61,6 +62,7 @@ spec: format: int32 type: integer runLauncherAsNode: + default: false description: |- Whether to run training process on the launcher Job. Defaults to false. @@ -583,14 +585,27 @@ spec: type: integer type: object numProcPerNode: + anyOf: + - type: integer + - type: string + default: auto description: |- Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. - type: string + x-kubernetes-int-or-string: true + x-kubernetes-validations: + - message: NumProcPerNode must be equal to auto, cpu, gpu, + or int value + rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int type: object type: object + x-kubernetes-validations: + - message: numNodes should not be set if torch.elasticPolicy is configured + rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))' + - message: Only one of the policy can be configured + rule: '!(has(self.torch) && has(self.mpi))' podGroupPolicy: description: Configuration for the PodGroup to enable gang-scheduling via supported plugins. @@ -600,6 +615,7 @@ spec: for gang-scheduling. properties: scheduleTimeoutSeconds: + default: 60 description: |- Time threshold to schedule PodGroup for gang-scheduling. If the scheduling timeout is equal to 0, the default value is used. diff --git a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml index 037e04191d..c22cc27896 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainjobs.yaml @@ -3138,11 +3138,14 @@ spec: format: int32 type: integer numProcPerNode: + anyOf: + - type: integer + - type: string description: |- Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. - type: string + x-kubernetes-int-or-string: true resourcesPerNode: description: Compute resources for each training node. 
properties: diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index d2ddceb140..b8d4a067b3 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -19,6 +19,7 @@ package v2alpha1 import ( autoscalingv2 "k8s.io/api/autoscaling/v2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) @@ -142,10 +143,13 @@ type CoschedulingPodGroupPolicySource struct { // Time threshold to schedule PodGroup for gang-scheduling. // If the scheduling timeout is equal to 0, the default value is used. // Defaults to 60 seconds. + // +kubebuilder:default=60 ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` } // MLPolicy represents configuration for the model trining with ML-specific parameters. +// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured" +// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured" type MLPolicy struct { // Number of training nodes. // Defaults to 1. @@ -173,7 +177,9 @@ type TorchMLPolicySource struct { // Supported values: `auto`, `cpu`, `gpu`, or int value. // TODO (andreyvelich): Add kubebuilder validation. // Defaults to `auto`. - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + // +kubebuilder:default="auto" + // +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value" + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` // Elastic policy for the PyTorch training. ElasticPolicy *TorchElasticPolicy `json:"elasticPolicy,omitempty"` @@ -210,6 +216,7 @@ type MPIMLPolicySource struct { // Implementation name for the MPI to create the appropriate hostfile. // Defaults to OpenMPI. + // +kubebuilder:default="OpenMPI" MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"` // Directory where SSH keys are mounted. @@ -217,6 +224,7 @@ type MPIMLPolicySource struct { // Whether to run training process on the launcher Job. // Defaults to false. + // +kubebuilder:default=false RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"` } diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go index 04f995c1fa..c1e2971d21 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainjob_types.go @@ -19,6 +19,7 @@ package v2alpha1 import ( corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" ) const ( @@ -194,7 +195,7 @@ type Trainer struct { // Number of processes/workers/slots on every training node. // For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. // For the MPI runtime only int value can be set. - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` } // DatasetConfig represents the desired dataset configuration. 
diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go index cb4b3a5eeb..9c408b132e 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go @@ -24,6 +24,7 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" + intstr "k8s.io/apimachinery/pkg/util/intstr" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. @@ -576,7 +577,7 @@ func (in *TorchMLPolicySource) DeepCopyInto(out *TorchMLPolicySource) { *out = *in if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode - *out = new(string) + *out = new(intstr.IntOrString) **out = **in } if in.ElasticPolicy != nil { @@ -786,7 +787,7 @@ func (in *Trainer) DeepCopyInto(out *Trainer) { } if in.NumProcPerNode != nil { in, out := &in.NumProcPerNode, &out.NumProcPerNode - *out = new(string) + *out = new(intstr.IntOrString) **out = **in } return diff --git a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go index 5248d0ef02..da42681fad 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go +++ b/pkg/apis/kubeflow.org/v2alpha1/zz_generated.openapi.go @@ -974,8 +974,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TorchMLPolicySource(ref common.Referen "numProcPerNode": { SchemaProps: spec.SchemaProps{ Description: "Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`.", - Type: []string{"string"}, - Format: "", + Ref: ref("k8s.io/apimachinery/pkg/util/intstr.IntOrString"), }, }, "elasticPolicy": { @@ -988,7 +987,7 @@ func schema_pkg_apis_kubefloworg_v2alpha1_TorchMLPolicySource(ref common.Referen }, }, Dependencies: []string{ - "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchElasticPolicy"}, + "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1.TorchElasticPolicy", "k8s.io/apimachinery/pkg/util/intstr.IntOrString"}, } } @@ -1352,15 +1351,14 @@ func schema_pkg_apis_kubefloworg_v2alpha1_Trainer(ref common.ReferenceCallback) "numProcPerNode": { SchemaProps: spec.SchemaProps{ Description: "Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set.", - Type: []string{"string"}, - Format: "", + Ref: ref("k8s.io/apimachinery/pkg/util/intstr.IntOrString"), }, }, }, }, }, Dependencies: []string{ - "k8s.io/api/core/v1.EnvVar", "k8s.io/api/core/v1.ResourceRequirements"}, + "k8s.io/api/core/v1.EnvVar", "k8s.io/api/core/v1.ResourceRequirements", "k8s.io/apimachinery/pkg/util/intstr.IntOrString"}, } } diff --git a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go index 401234ac01..8a7c4883a2 100644 --- a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go +++ b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/torchmlpolicysource.go @@ -16,10 +16,14 @@ package v2alpha1 +import ( + intstr "k8s.io/apimachinery/pkg/util/intstr" +) + // TorchMLPolicySourceApplyConfiguration represents a declarative configuration of the TorchMLPolicySource type for use // with apply. 
type TorchMLPolicySourceApplyConfiguration struct { - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` ElasticPolicy *TorchElasticPolicyApplyConfiguration `json:"elasticPolicy,omitempty"` } @@ -32,7 +36,7 @@ func TorchMLPolicySource() *TorchMLPolicySourceApplyConfiguration { // WithNumProcPerNode sets the NumProcPerNode field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the NumProcPerNode field is set to the value of the last call. -func (b *TorchMLPolicySourceApplyConfiguration) WithNumProcPerNode(value string) *TorchMLPolicySourceApplyConfiguration { +func (b *TorchMLPolicySourceApplyConfiguration) WithNumProcPerNode(value intstr.IntOrString) *TorchMLPolicySourceApplyConfiguration { b.NumProcPerNode = &value return b } diff --git a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go index 49d991a440..f8d19c1275 100644 --- a/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go +++ b/pkg/client/applyconfiguration/kubeflow.org/v2alpha1/trainer.go @@ -18,6 +18,7 @@ package v2alpha1 import ( v1 "k8s.io/api/core/v1" + intstr "k8s.io/apimachinery/pkg/util/intstr" ) // TrainerApplyConfiguration represents a declarative configuration of the Trainer type for use @@ -29,7 +30,7 @@ type TrainerApplyConfiguration struct { Env []v1.EnvVar `json:"env,omitempty"` NumNodes *int32 `json:"numNodes,omitempty"` ResourcesPerNode *v1.ResourceRequirements `json:"resourcesPerNode,omitempty"` - NumProcPerNode *string `json:"numProcPerNode,omitempty"` + NumProcPerNode *intstr.IntOrString `json:"numProcPerNode,omitempty"` } // TrainerApplyConfiguration constructs a declarative configuration of the Trainer type for use with @@ -95,7 +96,7 @@ func (b *TrainerApplyConfiguration) WithResourcesPerNode(value v1.ResourceRequir // WithNumProcPerNode sets the NumProcPerNode field in the declarative configuration to the given value // and returns the receiver, so that objects can be built by chaining "With" function invocations. // If called multiple times, the NumProcPerNode field is set to the value of the last call. -func (b *TrainerApplyConfiguration) WithNumProcPerNode(value string) *TrainerApplyConfiguration { +func (b *TrainerApplyConfiguration) WithNumProcPerNode(value intstr.IntOrString) *TrainerApplyConfiguration { b.NumProcPerNode = &value return b } diff --git a/pkg/runtime.v2/core/trainingruntime_test.go b/pkg/runtime.v2/core/trainingruntime_test.go index 2a11716cd3..cbc54efa06 100644 --- a/pkg/runtime.v2/core/trainingruntime_test.go +++ b/pkg/runtime.v2/core/trainingruntime_test.go @@ -19,6 +19,7 @@ package core import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/intstr" "testing" "github.com/google/go-cmp/cmp" @@ -263,7 +264,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { "succeeded to build JobSet with Torch values from the TrainJob": { trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). - TorchPolicy(100, "auto"). + TorchPolicy(100, intstr.FromString("auto")). ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). 
Obj(), ).Obj(), @@ -273,7 +274,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { Trainer( testingutil.MakeTrainJobTrainerWrapper(). NumNodes(30). - NumProcPerNode("3"). + NumProcPerNode(intstr.FromInt32(3)). Obj(), ). Obj(), @@ -317,7 +318,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) { "succeeded to build JobSet with Torch values from the Runtime and envs.": { trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec( testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec). - TorchPolicy(100, "auto"). + TorchPolicy(100, intstr.FromString("auto")). ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests). ContainerTrainerEnv( []corev1.EnvVar{ diff --git a/pkg/runtime.v2/framework/plugins/torch/torch.go b/pkg/runtime.v2/framework/plugins/torch/torch.go index 4e9c40585f..2b69544d6f 100644 --- a/pkg/runtime.v2/framework/plugins/torch/torch.go +++ b/pkg/runtime.v2/framework/plugins/torch/torch.go @@ -19,6 +19,8 @@ package torch import ( "context" "fmt" + "k8s.io/apimachinery/pkg/util/intstr" + "strconv" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/sets" @@ -71,6 +73,12 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *kubeflowv2.TrainJo // TODO (andreyvelich): Add validation to check that TrainJob doesn't have "PET_" envs. // TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other. // Ref: https://github.com/kubeflow/training-operator/pull/2308#discussion_r1823229940 + var numProcPerNodeVal string + if numProcPerNode.Type == intstr.Int { + numProcPerNodeVal = strconv.Itoa(int(numProcPerNode.IntVal)) + } else { + numProcPerNodeVal = numProcPerNode.StrVal + } infoEnvs := []corev1.EnvVar{ { Name: constants.TorchEnvNumNodes, @@ -78,7 +86,7 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *kubeflowv2.TrainJo }, { Name: constants.TorchEnvNumProcPerNode, - Value: ptr.Deref(numProcPerNode, "auto"), + Value: numProcPerNodeVal, }, { Name: constants.TorchEnvNodeRank, diff --git a/pkg/util.v2/testing/wrapper.go b/pkg/util.v2/testing/wrapper.go index fdbb3dd6c7..ee8547c5d9 100644 --- a/pkg/util.v2/testing/wrapper.go +++ b/pkg/util.v2/testing/wrapper.go @@ -22,6 +22,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" jobsetv1alpha2 "sigs.k8s.io/jobset/api/jobset/v1alpha2" schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" @@ -392,7 +393,7 @@ func (t *TrainJobTrainerWrapper) NumNodes(numNodes int32) *TrainJobTrainerWrappe return t } -func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode string) *TrainJobTrainerWrapper { +func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode intstr.IntOrString) *TrainJobTrainerWrapper { t.Trainer.NumProcPerNode = &numProcPerNode return t } @@ -689,7 +690,7 @@ func (s *TrainingRuntimeSpecWrapper) NumNodes(numNodes int32) *TrainingRuntimeSp return s } -func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode string) *TrainingRuntimeSpecWrapper { +func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode intstr.IntOrString) *TrainingRuntimeSpecWrapper { s.MLPolicy = &kubeflowv2.MLPolicy{ NumNodes: &numNodes, MLPolicySource: kubeflowv2.MLPolicySource{ diff --git 
a/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md b/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md index 72c2a70129..95b14f6b53 100644 --- a/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md +++ b/sdk_v2/docs/KubeflowOrgV2alpha1TorchMLPolicySource.md @@ -5,7 +5,7 @@ TorchMLPolicySource represents a PyTorch runtime configuration. Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- **elastic_policy** | [**KubeflowOrgV2alpha1TorchElasticPolicy**](KubeflowOrgV2alpha1TorchElasticPolicy.md) | | [optional] -**num_proc_per_node** | **str** | Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. | [optional] +**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) | | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md b/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md index f69b5ba330..f31f0f0660 100644 --- a/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md +++ b/sdk_v2/docs/KubeflowOrgV2alpha1Trainer.md @@ -9,7 +9,7 @@ Name | Type | Description | Notes **env** | [**list[V1EnvVar]**](V1EnvVar.md) | List of environment variables to set in the training container. These values will be merged with the TrainingRuntime's trainer environments. | [optional] **image** | **str** | Docker image for the training container. | [optional] **num_nodes** | **int** | Number of training nodes. | [optional] -**num_proc_per_node** | **str** | Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. | [optional] +**num_proc_per_node** | [**K8sIoApimachineryPkgUtilIntstrIntOrString**](K8sIoApimachineryPkgUtilIntstrIntOrString.md) | | [optional] **resources_per_node** | [**V1ResourceRequirements**](V1ResourceRequirements.md) | | [optional] [[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md) diff --git a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py index 5cb0d5aa53..c61971ad01 100644 --- a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py +++ b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_torch_ml_policy_source.py @@ -34,7 +34,7 @@ class KubeflowOrgV2alpha1TorchMLPolicySource(object): """ openapi_types = { 'elastic_policy': 'KubeflowOrgV2alpha1TorchElasticPolicy', - 'num_proc_per_node': 'str' + 'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString' } attribute_map = { @@ -82,10 +82,9 @@ def elastic_policy(self, elastic_policy): def num_proc_per_node(self): """Gets the num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501 - Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. # noqa: E501 :return: The num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. 
# noqa: E501 - :rtype: str + :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString """ return self._num_proc_per_node @@ -93,10 +92,9 @@ def num_proc_per_node(self): def num_proc_per_node(self, num_proc_per_node): """Sets the num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. - Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. Supported values: `auto`, `cpu`, `gpu`, or int value. Defaults to `auto`. # noqa: E501 :param num_proc_per_node: The num_proc_per_node of this KubeflowOrgV2alpha1TorchMLPolicySource. # noqa: E501 - :type: str + :type: K8sIoApimachineryPkgUtilIntstrIntOrString """ self._num_proc_per_node = num_proc_per_node diff --git a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py index e82d160b96..73c0b81b89 100644 --- a/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py +++ b/sdk_v2/kubeflow/training/models/kubeflow_org_v2alpha1_trainer.py @@ -38,7 +38,7 @@ class KubeflowOrgV2alpha1Trainer(object): 'env': 'list[V1EnvVar]', 'image': 'str', 'num_nodes': 'int', - 'num_proc_per_node': 'str', + 'num_proc_per_node': 'K8sIoApimachineryPkgUtilIntstrIntOrString', 'resources_per_node': 'V1ResourceRequirements' } @@ -201,10 +201,9 @@ def num_nodes(self, num_nodes): def num_proc_per_node(self): """Gets the num_proc_per_node of this KubeflowOrgV2alpha1Trainer. # noqa: E501 - Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. # noqa: E501 :return: The num_proc_per_node of this KubeflowOrgV2alpha1Trainer. # noqa: E501 - :rtype: str + :rtype: K8sIoApimachineryPkgUtilIntstrIntOrString """ return self._num_proc_per_node @@ -212,10 +211,9 @@ def num_proc_per_node(self): def num_proc_per_node(self, num_proc_per_node): """Sets the num_proc_per_node of this KubeflowOrgV2alpha1Trainer. - Number of processes/workers/slots on every training node. For the Torch runtime: `auto`, `cpu`, `gpu`, or int value can be set. For the MPI runtime only int value can be set. # noqa: E501 :param num_proc_per_node: The num_proc_per_node of this KubeflowOrgV2alpha1Trainer. # noqa: E501 - :type: str + :type: K8sIoApimachineryPkgUtilIntstrIntOrString """ self._num_proc_per_node = num_proc_per_node diff --git a/test/integration/webhook.v2/clustertrainingruntime_test.go b/test/integration/webhook.v2/clustertrainingruntime_test.go index 831937ed3f..88defc0f0d 100644 --- a/test/integration/webhook.v2/clustertrainingruntime_test.go +++ b/test/integration/webhook.v2/clustertrainingruntime_test.go @@ -17,14 +17,13 @@ limitations under the License. 
package webhookv2 import ( + kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" + testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing" + "github.com/kubeflow/training-operator/test/integration/framework" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - - kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" - testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing" - "github.com/kubeflow/training-operator/test/integration/framework" ) const clTrainingRuntimeName = "test-clustertrainingruntime" diff --git a/test/integration/webhook.v2/trainingruntime_test.go b/test/integration/webhook.v2/trainingruntime_test.go index 0de7cd2250..f474ee6ca2 100644 --- a/test/integration/webhook.v2/trainingruntime_test.go +++ b/test/integration/webhook.v2/trainingruntime_test.go @@ -17,10 +17,13 @@ limitations under the License. package webhookv2 import ( + "github.com/kubeflow/training-operator/test/util" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" @@ -75,3 +78,126 @@ var _ = ginkgo.Describe("TrainingRuntime Webhook", ginkgo.Ordered, func() { ) }) }) + +var _ = ginkgo.Describe("TrainingRuntime marker validations and defaulting", ginkgo.Ordered, func() { + var ns *corev1.Namespace + + ginkgo.BeforeAll(func() { + fwk = &framework.Framework{} + cfg = fwk.Init() + ctx, k8sClient = fwk.RunManager(cfg) + }) + ginkgo.AfterAll(func() { + fwk.Teardown() + }) + + ginkgo.BeforeEach(func() { + ns = &corev1.Namespace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "Namespace", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "training-runtime-marker-", + }, + } + gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) + }) + ginkgo.AfterEach(func() { + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainingRuntime{}, client.InNamespace(ns.Name))).Should(gomega.Succeed()) + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.ClusterTrainingRuntime{})).Should(gomega.Succeed()) + }) + + ginkgo.When("Creating TrainingRuntime", func() { + ginkgo.DescribeTable("Validate TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, errorMatcher gomega.OmegaMatcher) { + gomega.Expect(k8sClient.Create(ctx, trainingRuntime())).Should(errorMatcher) + }, + ginkgo.Entry("Should succeed to create trainingRuntime", + func() *kubeflowv2.TrainingRuntime { + return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime"). 
+ Obj() + }, + gomega.Succeed()), + ginkgo.Entry("Should fail to create trainingRuntime with both MPI and Torch runtimes", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{}, + MPI: &kubeflowv2.MPIMLPolicySource{}, + }, + } + return runtime + }, + testingutil.BeInvalidError()), + ginkgo.Entry("Should fail to create trainingRuntime with minNodes and torch.elasticPolicy", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + NumNodes: ptr.To(int32(2)), + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{ + ElasticPolicy: &kubeflowv2.TorchElasticPolicy{}, + }, + }, + } + return runtime + }, + testingutil.BeInvalidError()), + ) + ginkgo.DescribeTable("Defaulting TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, wantTrainingRuntime func() *kubeflowv2.TrainingRuntime) { + created := trainingRuntime() + gomega.Expect(k8sClient.Create(ctx, created)).Should(gomega.Succeed()) + gomega.Expect(created).Should(gomega.BeComparableTo(wantTrainingRuntime(), util.IgnoreObjectMetadata)) + }, + ginkgo.Entry("Should succeed to default torch.NumProcPerNode=auto", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{}, + }, + } + return runtime + }, + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{ + NumProcPerNode: ptr.To(intstr.FromString("auto")), + }, + }, + } + runtime.Spec.Template.Spec = testingutil.MakeJobSetWrapper(ns.Name, "runtime"). + Replicas(1).Obj().Spec + return runtime + }), + + ginkgo.Entry("Should succeed to default mpi.mpiImplementation=OpenMPI", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + MPI: &kubeflowv2.MPIMLPolicySource{}, + }, + } + return runtime + }, + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + MPI: &kubeflowv2.MPIMLPolicySource{ + MPIImplementation: ptr.To(kubeflowv2.MPIImplementationOpenMPI), + RunLauncherAsNode: ptr.To(false), + }, + }, + } + runtime.Spec.Template.Spec = testingutil.MakeJobSetWrapper(ns.Name, "runtime"). + Replicas(1).Obj().Spec + return runtime + }), + ) + }) +})
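
For illustration, a minimal sketch of how the new CEL rules behave once this patch is applied (the `mlPolicy` field name and the elided JobSet template are assumptions based on the CRD hunks above, not shown verbatim in this diff):

# Rejected on admission: both torch and mpi are set, which trips the new
# "Only one of the policy can be configured" rule on MLPolicy.
apiVersion: kubeflow.org/v2alpha1
kind: TrainingRuntime
metadata:
  name: invalid-runtime
spec:
  mlPolicy:
    torch: {}
    mpi: {}
  template:
    spec: {}   # JobSet template elided for brevity
---
# Admitted: Torch only. numProcPerNode may be "auto", "cpu", "gpu", or an
# integer (here 8); if omitted it defaults to "auto". numNodes is allowed
# because torch.elasticPolicy is not configured.
apiVersion: kubeflow.org/v2alpha1
kind: TrainingRuntime
metadata:
  name: torch-runtime
spec:
  mlPolicy:
    numNodes: 2
    torch:
      numProcPerNode: 8
  template:
    spec: {}   # JobSet template elided for brevity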