From 6d6d83dfe7d8c3f4f3938c00c04d8b7410caa205 Mon Sep 17 00:00:00 2001 From: yu lin <37265556+Syulin7@users.noreply.github.com> Date: Tue, 23 May 2023 19:10:35 +0800 Subject: [PATCH] Make scheduler-plugins the default gang scheduler. (#1747) Signed-off-by: Syulin7 <735122171@qq.com> --- cmd/training-operator.v1/main.go | 7 ++++--- go.mod | 2 +- go.sum | 4 ++-- hack/python-sdk/swagger.json | 2 +- manifests/base/crds/kubeflow.org_mpijobs.yaml | 2 +- manifests/base/crds/kubeflow.org_mxjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_paddlejobs.yaml | 2 +- manifests/base/crds/kubeflow.org_pytorchjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_tfjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_xgboostjobs.yaml | 2 +- pkg/apis/kubeflow.org/v1/openapi_generated.go | 2 +- sdk/python/docs/V1RunPolicy.md | 2 +- sdk/python/kubeflow/training/models/v1_run_policy.py | 4 ++-- 13 files changed, 18 insertions(+), 17 deletions(-) diff --git a/cmd/training-operator.v1/main.go b/cmd/training-operator.v1/main.go index 164a019c62..da5f780059 100644 --- a/cmd/training-operator.v1/main.go +++ b/cmd/training-operator.v1/main.go @@ -73,7 +73,8 @@ func main() { flag.StringVar(&leaderElectionID, "leader-election-id", "1ca428e5.training-operator.kubeflow.org", "The ID for leader election.") flag.Var(&enabledSchemes, "enable-scheme", "Enable scheme(s) as --enable-scheme=tfjob --enable-scheme=pytorchjob, case insensitive."+ " Now supporting TFJob, PyTorchJob, MXNetJob, XGBoostJob, PaddleJob. By default, all supported schemes will be enabled.") - flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "none", "The scheduler to gang-schedule kubeflow jobs, defaults to none") + flag.StringVar(&gangSchedulerName, "gang-scheduler-name", "", "Now Supporting volcano and scheduler-plugins."+ + " Note: If you set another scheduler name, the training-operator assumes it's the scheduler-plugins.") flag.StringVar(&namespace, "namespace", os.Getenv(commonutil.EnvKubeflowNamespace), "The namespace to monitor kubeflow jobs. If unset, it monitors all namespaces cluster-wide."+ "If set, it only monitors kubeflow jobs in the given namespace.") flag.IntVar(&monitoringPort, "monitoring-port", 9443, "Endpoint port for displaying monitoring metrics. "+ @@ -121,8 +122,8 @@ func main() { cfg := mgr.GetConfig() volcanoClientSet := volcanoclient.NewForConfigOrDie(cfg) gangSchedulingSetupFunc = common.GenVolcanoSetupFunc(volcanoClientSet) - } else if strings.EqualFold(gangSchedulerName, string(common.GangSchedulerSchedulerPlugins)) { - gangSchedulingSetupFunc = common.GenSchedulerPluginsSetupFunc(mgr.GetClient()) + } else if gangSchedulerName != "" { + gangSchedulingSetupFunc = common.GenSchedulerPluginsSetupFunc(mgr.GetClient(), gangSchedulerName) } // TODO: We need a general manager. all rest reconciler addsToManager diff --git a/go.mod b/go.mod index fd90ff5fa6..fdb4343388 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.19 require ( github.com/go-logr/logr v1.2.3 github.com/google/go-cmp v0.5.8 - github.com/kubeflow/common v0.4.6 + github.com/kubeflow/common v0.4.7 github.com/onsi/ginkgo/v2 v2.1.6 github.com/onsi/gomega v1.20.1 github.com/prometheus/client_golang v1.12.2 diff --git a/go.sum b/go.sum index b295f3a6a1..6ffe6a599b 100644 --- a/go.sum +++ b/go.sum @@ -327,8 +327,8 @@ github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= -github.com/kubeflow/common v0.4.6 h1:yzJf/HEdS6ginD0GlVkgbOFie0Sp66VdGjXidAGZIlk= -github.com/kubeflow/common v0.4.6/go.mod h1:43MAof/uhpJA2C0urynqatE3oKFQc7m2HLmJty7waqY= +github.com/kubeflow/common v0.4.7 h1:zz6QS4k2u2FY838M/FjOtwjJq39MRZVZcvPahRYL97M= +github.com/kubeflow/common v0.4.7/go.mod h1:43MAof/uhpJA2C0urynqatE3oKFQc7m2HLmJty7waqY= github.com/mailru/easyjson v0.0.0-20160728113105-d5b7844b561a/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index 7247abd6ba..a67497c4e7 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -743,7 +743,7 @@ "format": "int32" }, "cleanPodPolicy": { - "description": "CleanPodPolicy defines the policy to kill pods after the job completes. Default to Running.", + "description": "CleanPodPolicy defines the policy to kill pods after the job completes. Default to None.", "type": "string" }, "schedulingPolicy": { diff --git a/manifests/base/crds/kubeflow.org_mpijobs.yaml b/manifests/base/crds/kubeflow.org_mpijobs.yaml index b1a84dc7f7..3f8df97235 100644 --- a/manifests/base/crds/kubeflow.org_mpijobs.yaml +++ b/manifests/base/crds/kubeflow.org_mpijobs.yaml @@ -7374,7 +7374,7 @@ spec: type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to Running. + the job completes. Default to None. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_mxjobs.yaml b/manifests/base/crds/kubeflow.org_mxjobs.yaml index 0b70d7316b..c5475f5f58 100644 --- a/manifests/base/crds/kubeflow.org_mxjobs.yaml +++ b/manifests/base/crds/kubeflow.org_mxjobs.yaml @@ -7374,7 +7374,7 @@ spec: type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to Running. + the job completes. Default to None. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_paddlejobs.yaml b/manifests/base/crds/kubeflow.org_paddlejobs.yaml index 4446769ddb..05c546699a 100644 --- a/manifests/base/crds/kubeflow.org_paddlejobs.yaml +++ b/manifests/base/crds/kubeflow.org_paddlejobs.yaml @@ -7881,7 +7881,7 @@ spec: type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to Running. + the job completes. Default to None. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml index bb2de4e1f7..b38b68eb6d 100644 --- a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml +++ b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml @@ -7910,7 +7910,7 @@ spec: type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to Running. + the job completes. Default to None. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_tfjobs.yaml b/manifests/base/crds/kubeflow.org_tfjobs.yaml index a36d8b1734..a761c3a0f4 100644 --- a/manifests/base/crds/kubeflow.org_tfjobs.yaml +++ b/manifests/base/crds/kubeflow.org_tfjobs.yaml @@ -63,7 +63,7 @@ spec: type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to Running. + the job completes. Default to None. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml index c10ee67f8e..74dbe7a1ae 100644 --- a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml +++ b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml @@ -59,7 +59,7 @@ spec: type: integer cleanPodPolicy: description: CleanPodPolicy defines the policy to kill pods after - the job completes. Default to Running. + the job completes. Default to None. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/pkg/apis/kubeflow.org/v1/openapi_generated.go b/pkg/apis/kubeflow.org/v1/openapi_generated.go index 715eade7a4..20cb293490 100644 --- a/pkg/apis/kubeflow.org/v1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v1/openapi_generated.go @@ -275,7 +275,7 @@ func schema_pkg_apis_common_v1_RunPolicy(ref common.ReferenceCallback) common.Op Properties: map[string]spec.Schema{ "cleanPodPolicy": { SchemaProps: spec.SchemaProps{ - Description: "CleanPodPolicy defines the policy to kill pods after the job completes. Default to Running.", + Description: "CleanPodPolicy defines the policy to kill pods after the job completes. Default to None.", Type: []string{"string"}, Format: "", }, diff --git a/sdk/python/docs/V1RunPolicy.md b/sdk/python/docs/V1RunPolicy.md index 7a6895e794..5c1d0b18e0 100644 --- a/sdk/python/docs/V1RunPolicy.md +++ b/sdk/python/docs/V1RunPolicy.md @@ -6,7 +6,7 @@ Name | Type | Description | Notes ------------ | ------------- | ------------- | ------------- **active_deadline_seconds** | **int** | Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer. | [optional] **backoff_limit** | **int** | Optional number of retries before marking this job failed. | [optional] -**clean_pod_policy** | **str** | CleanPodPolicy defines the policy to kill pods after the job completes. Default to Running. | [optional] +**clean_pod_policy** | **str** | CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. | [optional] **scheduling_policy** | [**V1SchedulingPolicy**](V1SchedulingPolicy.md) | | [optional] **ttl_seconds_after_finished** | **int** | TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite. | [optional] diff --git a/sdk/python/kubeflow/training/models/v1_run_policy.py b/sdk/python/kubeflow/training/models/v1_run_policy.py index f064879671..f98a8c14e3 100644 --- a/sdk/python/kubeflow/training/models/v1_run_policy.py +++ b/sdk/python/kubeflow/training/models/v1_run_policy.py @@ -122,7 +122,7 @@ def backoff_limit(self, backoff_limit): def clean_pod_policy(self): """Gets the clean_pod_policy of this V1RunPolicy. # noqa: E501 - CleanPodPolicy defines the policy to kill pods after the job completes. Default to Running. # noqa: E501 + CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. # noqa: E501 :return: The clean_pod_policy of this V1RunPolicy. # noqa: E501 :rtype: str @@ -133,7 +133,7 @@ def clean_pod_policy(self): def clean_pod_policy(self, clean_pod_policy): """Sets the clean_pod_policy of this V1RunPolicy. - CleanPodPolicy defines the policy to kill pods after the job completes. Default to Running. # noqa: E501 + CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. # noqa: E501 :param clean_pod_policy: The clean_pod_policy of this V1RunPolicy. # noqa: E501 :type: str