From 59cc98cbfc906546b096a29f5d31482fba7cdebf Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Wed, 5 Jul 2023 01:03:41 +0900 Subject: [PATCH] Remove duplicated imports for pkg/apis/kubeflow.org/v1 (#1847) Signed-off-by: Yuki Iwai --- pkg/apis/kubeflow.org/v1/common_types.go | 1 - pkg/common/util/reconciler.go | 15 +++-- pkg/common/util/reconciler_generic.go | 8 +-- pkg/common/util/reconciler_test.go | 7 +- pkg/common/util/scheduler.go | 6 +- pkg/common/util/util.go | 11 +-- pkg/common/util/util_test.go | 46 ++++++------- pkg/controller.v1/mpi/mpijob.go | 40 ++++++----- pkg/controller.v1/mpi/mpijob_controller.go | 53 +++++++-------- pkg/controller.v1/mxnet/mxjob_controller.go | 39 ++++++----- pkg/controller.v1/mxnet/mxnet.go | 7 +- pkg/controller.v1/paddlepaddle/envvar.go | 5 +- .../paddlepaddle/paddlepaddle_controller.go | 37 +++++----- .../paddlepaddle_controller_test.go | 7 +- pkg/controller.v1/pytorch/elastic_test.go | 5 +- pkg/controller.v1/pytorch/envvar.go | 5 +- .../pytorch/initcontainer_test.go | 9 ++- .../pytorch/pytorchjob_controller.go | 37 +++++----- .../pytorch/pytorchjob_controller_test.go | 11 ++- pkg/controller.v1/tensorflow/job_test.go | 17 +++-- pkg/controller.v1/tensorflow/pod_test.go | 31 +++++---- pkg/controller.v1/tensorflow/status_test.go | 51 +++++++------- pkg/controller.v1/tensorflow/testutil/pod.go | 12 ++-- .../tensorflow/testutil/service.go | 13 ++-- .../tensorflow/testutil/tfjob.go | 39 ++++++----- .../tensorflow/tfjob_controller.go | 67 +++++++++---------- .../tensorflow/tfjob_controller_test.go | 7 +- pkg/controller.v1/tensorflow/util.go | 19 +++--- pkg/controller.v1/tensorflow/util_test.go | 9 ++- pkg/controller.v1/xgboost/status.go | 10 +-- pkg/controller.v1/xgboost/status_test.go | 50 +++++++------- pkg/controller.v1/xgboost/xgboost.go | 9 ++- .../xgboost/xgboostjob_controller.go | 23 +++---- .../common/gang_scheduler_framework.go | 8 +-- pkg/reconciler.v1/common/gang_volcano.go | 8 +-- pkg/reconciler.v1/common/interface.go | 62 ++++++++--------- pkg/reconciler.v1/common/job.go | 63 +++++++++-------- pkg/reconciler.v1/common/pod.go | 24 +++---- pkg/reconciler.v1/common/pod_test.go | 8 +-- pkg/reconciler.v1/common/service.go | 16 ++--- pkg/reconciler.v1/common/service_test.go | 20 +++--- pkg/reconciler.v1/common/utils_test.go | 6 +- test_job/apis/test_job/v1/constants.go | 4 +- test_job/apis/test_job/v1/defaults.go | 9 +-- test_job/apis/test_job/v1/types.go | 10 +-- .../test_job/test_job_reconciler.go | 38 +++++------ 46 files changed, 481 insertions(+), 501 deletions(-) diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index f6f5e710b1..3f263a791b 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -21,7 +21,6 @@ import ( ) const ( - // ReplicaIndexLabel represents the label key for the replica-index, e.g. 0, 1, 2.. 
etc ReplicaIndexLabel = "training.kubeflow.org/replica-index" diff --git a/pkg/common/util/reconciler.go b/pkg/common/util/reconciler.go index faf91cbb53..350cbdde94 100644 --- a/pkg/common/util/reconciler.go +++ b/pkg/common/util/reconciler.go @@ -18,19 +18,20 @@ import ( "fmt" "reflect" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" - "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" - commonutil "github.com/kubeflow/training-operator/pkg/util" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/event" + + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" + "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" + commonutil "github.com/kubeflow/training-operator/pkg/util" ) // SatisfiedExpectations returns true if the required adds/dels for the given mxjob have been observed. // Add/del counts are established by the controller at sync time, and updated as controllees are observed by the controller // manager. -func SatisfiedExpectations(exp expectation.ControllerExpectationsInterface, jobKey string, replicaTypes []commonv1.ReplicaType) bool { +func SatisfiedExpectations(exp expectation.ControllerExpectationsInterface, jobKey string, replicaTypes []kubeflowv1.ReplicaType) bool { satisfied := false for _, rtype := range replicaTypes { // Check the expectations of the pods. @@ -47,7 +48,7 @@ func SatisfiedExpectations(exp expectation.ControllerExpectationsInterface, jobK // OnDependentCreateFunc modify expectations when dependent (pod/service) creation observed. func OnDependentCreateFunc(exp expectation.ControllerExpectationsInterface) func(event.CreateEvent) bool { return func(e event.CreateEvent) bool { - rtype := e.Object.GetLabels()[commonv1.ReplicaTypeLabel] + rtype := e.Object.GetLabels()[kubeflowv1.ReplicaTypeLabel] if len(rtype) == 0 { return false } @@ -145,7 +146,7 @@ func resolveControllerRef(jc *common.JobController, namespace string, controller func OnDependentDeleteFunc(exp expectation.ControllerExpectationsInterface) func(event.DeleteEvent) bool { return func(e event.DeleteEvent) bool { - rtype := e.Object.GetLabels()[commonv1.ReplicaTypeLabel] + rtype := e.Object.GetLabels()[kubeflowv1.ReplicaTypeLabel] if len(rtype) == 0 { return false } diff --git a/pkg/common/util/reconciler_generic.go b/pkg/common/util/reconciler_generic.go index 7433981912..4b3f737436 100644 --- a/pkg/common/util/reconciler_generic.go +++ b/pkg/common/util/reconciler_generic.go @@ -19,10 +19,10 @@ import ( "reflect" "strings" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" log "github.com/sirupsen/logrus" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/event" @@ -53,7 +53,7 @@ func LoggerForGenericKind(obj metav1.Object, kind string) *log.Entry { // OnDependentCreateFuncGeneric modify expectations when dependent (pod/service) creation observed. 
func OnDependentCreateFuncGeneric(exp expectation.ControllerExpectationsInterface) func(event.CreateEvent) bool { return func(e event.CreateEvent) bool { - rtype := e.Object.GetLabels()[commonv1.ReplicaTypeLabel] + rtype := e.Object.GetLabels()[kubeflowv1.ReplicaTypeLabel] if len(rtype) == 0 { return false } @@ -114,7 +114,7 @@ func OnDependentUpdateFuncGeneric(jc *common.JobController) func(updateEvent eve func OnDependentDeleteFuncGeneric(exp expectation.ControllerExpectationsInterface) func(event.DeleteEvent) bool { return func(e event.DeleteEvent) bool { - rtype := e.Object.GetLabels()[commonv1.ReplicaTypeLabel] + rtype := e.Object.GetLabels()[kubeflowv1.ReplicaTypeLabel] if len(rtype) == 0 { return false } diff --git a/pkg/common/util/reconciler_test.go b/pkg/common/util/reconciler_test.go index fcebd5d650..5442216889 100644 --- a/pkg/common/util/reconciler_test.go +++ b/pkg/common/util/reconciler_test.go @@ -3,12 +3,13 @@ package util import ( "testing" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/event" + + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" ) func TestOnDependentXXXFunc(t *testing.T) { @@ -24,7 +25,7 @@ func TestOnDependentXXXFunc(t *testing.T) { object: &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ - commonv1.ReplicaTypeLabel: "Worker", + kubeflowv1.ReplicaTypeLabel: "Worker", }, }, }, diff --git a/pkg/common/util/scheduler.go b/pkg/common/util/scheduler.go index 0a5ede1700..a7bbff1d82 100644 --- a/pkg/common/util/scheduler.go +++ b/pkg/common/util/scheduler.go @@ -14,9 +14,9 @@ package util -import commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" +import kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" -func IsGangSchedulerSet(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, schedulerName string) bool { +func IsGangSchedulerSet(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, schedulerName string) bool { for _, spec := range replicas { if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName == schedulerName { return true @@ -25,7 +25,7 @@ func IsGangSchedulerSet(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, return false } -func GetSchedulerName(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) string { +func GetSchedulerName(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) string { for _, spec := range replicas { if len(spec.Template.Spec.SchedulerName) > 0 { return spec.Template.Spec.SchedulerName diff --git a/pkg/common/util/util.go b/pkg/common/util/util.go index 2caea3a8bd..e0aa438e48 100644 --- a/pkg/common/util/util.go +++ b/pkg/common/util/util.go @@ -18,10 +18,11 @@ import ( "fmt" "time" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - commonutil "github.com/kubeflow/training-operator/pkg/util" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + commonutil "github.com/kubeflow/training-operator/pkg/util" ) type ObjectFilterFunction func(obj metav1.Object) bool @@ -53,8 +54,8 @@ func JobControlledPodList(list []corev1.Pod, job 
metav1.Object) []*corev1.Pod { return ret } -func GetReplicaTypes(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) []commonv1.ReplicaType { - keys := make([]commonv1.ReplicaType, 0, len(specs)) +func GetReplicaTypes(specs map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) []kubeflowv1.ReplicaType { + keys := make([]kubeflowv1.ReplicaType, 0, len(specs)) for k := range specs { keys = append(keys, k) } @@ -62,7 +63,7 @@ func GetReplicaTypes(specs map[commonv1.ReplicaType]*commonv1.ReplicaSpec) []com } // DurationUntilExpireTime returns the duration until job needs to be cleaned up, or -1 if it's infinite. -func DurationUntilExpireTime(runPolicy *commonv1.RunPolicy, jobStatus commonv1.JobStatus) (time.Duration, error) { +func DurationUntilExpireTime(runPolicy *kubeflowv1.RunPolicy, jobStatus kubeflowv1.JobStatus) (time.Duration, error) { if !commonutil.IsSucceeded(jobStatus) && !commonutil.IsFailed(jobStatus) { return -1, nil } diff --git a/pkg/common/util/util_test.go b/pkg/common/util/util_test.go index 1fe09125c6..556e5210b2 100644 --- a/pkg/common/util/util_test.go +++ b/pkg/common/util/util_test.go @@ -22,33 +22,33 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/pointer" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestDurationUntilExpireTime(t *testing.T) { tests := []struct { name string - runPolicy *commonv1.RunPolicy - jobStatus commonv1.JobStatus + runPolicy *kubeflowv1.RunPolicy + jobStatus kubeflowv1.JobStatus want time.Duration wantErr bool }{ { name: "running job", - runPolicy: &commonv1.RunPolicy{}, - jobStatus: commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{newJobCondition(commonv1.JobRunning)}, + runPolicy: &kubeflowv1.RunPolicy{}, + jobStatus: kubeflowv1.JobStatus{ + Conditions: []kubeflowv1.JobCondition{newJobCondition(kubeflowv1.JobRunning)}, }, want: -1, wantErr: false, }, { name: "succeeded job with remaining time 1s", - runPolicy: &commonv1.RunPolicy{ + runPolicy: &kubeflowv1.RunPolicy{ TTLSecondsAfterFinished: pointer.Int32(5), }, - jobStatus: commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{newJobCondition(commonv1.JobSucceeded)}, + jobStatus: kubeflowv1.JobStatus{ + Conditions: []kubeflowv1.JobCondition{newJobCondition(kubeflowv1.JobSucceeded)}, CompletionTime: &metav1.Time{Time: time.Now().Add(4 * time.Second)}, }, want: 1, @@ -56,11 +56,11 @@ func TestDurationUntilExpireTime(t *testing.T) { }, { name: "failed job with remaining time 1s", - runPolicy: &commonv1.RunPolicy{ + runPolicy: &kubeflowv1.RunPolicy{ TTLSecondsAfterFinished: pointer.Int32(5), }, - jobStatus: commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{newJobCondition(commonv1.JobFailed)}, + jobStatus: kubeflowv1.JobStatus{ + Conditions: []kubeflowv1.JobCondition{newJobCondition(kubeflowv1.JobFailed)}, CompletionTime: &metav1.Time{Time: time.Now().Add(4 * time.Second)}, }, want: 1, @@ -68,9 +68,9 @@ func TestDurationUntilExpireTime(t *testing.T) { }, { name: "succeeded job with infinite TTL", - runPolicy: &commonv1.RunPolicy{}, - jobStatus: commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{newJobCondition(commonv1.JobSucceeded)}, + runPolicy: &kubeflowv1.RunPolicy{}, + jobStatus: kubeflowv1.JobStatus{ + Conditions: []kubeflowv1.JobCondition{newJobCondition(kubeflowv1.JobSucceeded)}, CompletionTime: &metav1.Time{Time: time.Now().Add(4 * time.Second)}, }, want: -1, @@ -78,11 +78,11 @@ func TestDurationUntilExpireTime(t 
*testing.T) { }, { name: "succeeded job without remaining time", - runPolicy: &commonv1.RunPolicy{ + runPolicy: &kubeflowv1.RunPolicy{ TTLSecondsAfterFinished: pointer.Int32(5), }, - jobStatus: commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{newJobCondition(commonv1.JobSucceeded)}, + jobStatus: kubeflowv1.JobStatus{ + Conditions: []kubeflowv1.JobCondition{newJobCondition(kubeflowv1.JobSucceeded)}, CompletionTime: &metav1.Time{Time: time.Now().Add(6 * time.Second)}, }, want: 0, @@ -90,11 +90,11 @@ func TestDurationUntilExpireTime(t *testing.T) { }, { name: "succeeded job with nil completion time error", - runPolicy: &commonv1.RunPolicy{ + runPolicy: &kubeflowv1.RunPolicy{ TTLSecondsAfterFinished: pointer.Int32(5), }, - jobStatus: commonv1.JobStatus{ - Conditions: []commonv1.JobCondition{newJobCondition(commonv1.JobSucceeded)}, + jobStatus: kubeflowv1.JobStatus{ + Conditions: []kubeflowv1.JobCondition{newJobCondition(kubeflowv1.JobSucceeded)}, }, want: -1, wantErr: true, @@ -116,8 +116,8 @@ func TestDurationUntilExpireTime(t *testing.T) { } } -func newJobCondition(t commonv1.JobConditionType) commonv1.JobCondition { - return commonv1.JobCondition{ +func newJobCondition(t kubeflowv1.JobConditionType) kubeflowv1.JobCondition { + return kubeflowv1.JobCondition{ Type: t, Status: corev1.ConditionTrue, } diff --git a/pkg/controller.v1/mpi/mpijob.go b/pkg/controller.v1/mpi/mpijob.go index c3cb459b81..86d67bf6d5 100644 --- a/pkg/controller.v1/mpi/mpijob.go +++ b/pkg/controller.v1/mpi/mpijob.go @@ -17,7 +17,6 @@ package mpi import ( "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" corev1 "k8s.io/api/core/v1" @@ -89,25 +88,24 @@ const ( ) // initializeMPIJobStatuses initializes the ReplicaStatuses for MPIJob. -func initializeMPIJobStatuses(mpiJob *kubeflowv1.MPIJob, mtype commonv1.ReplicaType) { - replicaType := commonv1.ReplicaType(mtype) +func initializeMPIJobStatuses(mpiJob *kubeflowv1.MPIJob, rType kubeflowv1.ReplicaType) { if mpiJob.Status.ReplicaStatuses == nil { - mpiJob.Status.ReplicaStatuses = make(map[commonv1.ReplicaType]*commonv1.ReplicaStatus) + mpiJob.Status.ReplicaStatuses = make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus) } - mpiJob.Status.ReplicaStatuses[replicaType] = &commonv1.ReplicaStatus{} + mpiJob.Status.ReplicaStatuses[rType] = &kubeflowv1.ReplicaStatus{} } // updateMPIJobConditions updates the conditions of the given mpiJob. -func updateMPIJobConditions(mpiJob *kubeflowv1.MPIJob, conditionType commonv1.JobConditionType, reason, message string) error { +func updateMPIJobConditions(mpiJob *kubeflowv1.MPIJob, conditionType kubeflowv1.JobConditionType, reason, message string) error { condition := newCondition(conditionType, reason, message) setCondition(&mpiJob.Status, condition) return nil } // newCondition creates a new mpiJob condition. -func newCondition(conditionType commonv1.JobConditionType, reason, message string) commonv1.JobCondition { - return commonv1.JobCondition{ +func newCondition(conditionType kubeflowv1.JobConditionType, reason, message string) kubeflowv1.JobCondition { + return kubeflowv1.JobCondition{ Type: conditionType, Status: corev1.ConditionTrue, LastUpdateTime: metav1.Now(), @@ -118,7 +116,7 @@ func newCondition(conditionType commonv1.JobConditionType, reason, message strin } // getCondition returns the condition with the provided type. 
-func getCondition(status commonv1.JobStatus, condType commonv1.JobConditionType) *commonv1.JobCondition { +func getCondition(status kubeflowv1.JobStatus, condType kubeflowv1.JobConditionType) *kubeflowv1.JobCondition { for _, condition := range status.Conditions { if condition.Type == condType { return &condition @@ -127,9 +125,9 @@ func getCondition(status commonv1.JobStatus, condType commonv1.JobConditionType) return nil } -func isEvicted(status commonv1.JobStatus) bool { +func isEvicted(status kubeflowv1.JobStatus) bool { for _, condition := range status.Conditions { - if condition.Type == commonv1.JobFailed && + if condition.Type == kubeflowv1.JobFailed && condition.Status == corev1.ConditionTrue && condition.Reason == mpiJobEvict { return true @@ -141,7 +139,7 @@ func isEvicted(status commonv1.JobStatus) bool { // setCondition updates the mpiJob to include the provided condition. // If the condition that we are about to add already exists // and has the same status and reason then we are not going to update. -func setCondition(status *commonv1.JobStatus, condition commonv1.JobCondition) { +func setCondition(status *kubeflowv1.JobStatus, condition kubeflowv1.JobCondition) { currentCond := getCondition(*status, condition.Type) @@ -161,13 +159,13 @@ func setCondition(status *commonv1.JobStatus, condition commonv1.JobCondition) { } // filterOutCondition returns a new slice of mpiJob conditions without conditions with the provided type. -func filterOutCondition(conditions []commonv1.JobCondition, condType commonv1.JobConditionType) []commonv1.JobCondition { - var newConditions []commonv1.JobCondition +func filterOutCondition(conditions []kubeflowv1.JobCondition, condType kubeflowv1.JobConditionType) []kubeflowv1.JobCondition { + var newConditions []kubeflowv1.JobCondition for _, c := range conditions { - if condType == commonv1.JobRestarting && c.Type == commonv1.JobRunning { + if condType == kubeflowv1.JobRestarting && c.Type == kubeflowv1.JobRunning { continue } - if condType == commonv1.JobRunning && c.Type == commonv1.JobRestarting { + if condType == kubeflowv1.JobRunning && c.Type == kubeflowv1.JobRestarting { continue } @@ -176,7 +174,7 @@ func filterOutCondition(conditions []commonv1.JobCondition, condType commonv1.Jo } // Set the running condition status to be false when current condition failed or succeeded - if (condType == commonv1.JobFailed || condType == commonv1.JobSucceeded) && (c.Type == commonv1.JobRunning || c.Type == commonv1.JobFailed) { + if (condType == kubeflowv1.JobFailed || condType == kubeflowv1.JobSucceeded) && (c.Type == kubeflowv1.JobRunning || c.Type == kubeflowv1.JobFailed) { c.Status = corev1.ConditionFalse } @@ -242,7 +240,7 @@ func defaultReplicaLabels(genericLabels map[string]string, roleLabelVal string) replicaLabels[k] = v } - replicaLabels[commonv1.ReplicaTypeLabel] = roleLabelVal + replicaLabels[kubeflowv1.ReplicaTypeLabel] = roleLabelVal return replicaLabels } @@ -271,10 +269,10 @@ func workerSelector(genericLabels map[string]string) (labels.Selector, error) { // initializeReplicaStatuses initializes the ReplicaStatuses for replica. 
// originally from pkg/controller.v1/tensorflow/status.go (deleted) -func initializeReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType) { +func initializeReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType) { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = make(map[commonv1.ReplicaType]*commonv1.ReplicaStatus) + jobStatus.ReplicaStatuses = make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus) } - jobStatus.ReplicaStatuses[rtype] = &commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses[rtype] = &kubeflowv1.ReplicaStatus{} } diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index f1f03787f8..e3a349a3f3 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -50,7 +50,6 @@ import ( schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common" "github.com/kubeflow/training-operator/pkg/common/util" @@ -257,8 +256,8 @@ func (jc *MPIJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads func (jc *MPIJobReconciler) ReconcileServices( job metav1.Object, services []*corev1.Service, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec) error { + rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec) error { return nil } @@ -291,8 +290,8 @@ func (jc *MPIJobReconciler) GetDefaultContainerPortName() string { return kubeflowv1.MPIJobDefaultPortName } -func (jc *MPIJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - rtype commonv1.ReplicaType, index int) bool { +func (jc *MPIJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, index int) bool { return string(rtype) == string(kubeflowv1.MPIJobReplicaTypeLauncher) } @@ -316,7 +315,7 @@ func (jc *MPIJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool { msg := fmt.Sprintf("MPIJob %s/%s is created.", mpiJob.Namespace, e.Object.GetName()) logrus.Info(msg) trainingoperatorcommon.CreatedJobsCounterInc(mpiJob.Namespace, kubeflowv1.MPIJobFrameworkName) - if err := commonutil.UpdateJobConditions(&mpiJob.Status, commonv1.JobCreated, mpiJobCreatedReason, msg); err != nil { + if err := commonutil.UpdateJobConditions(&mpiJob.Status, kubeflowv1.JobCreated, mpiJobCreatedReason, msg); err != nil { log.Log.Error(err, "append job condition error") return false } @@ -326,11 +325,11 @@ func (jc *MPIJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool { func (jc *MPIJobReconciler) ReconcilePods( job interface{}, - jobStatus *commonv1.JobStatus, + jobStatus *kubeflowv1.JobStatus, pods []*corev1.Pod, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, ) error { mpiJob, ok := job.(*kubeflowv1.MPIJob) @@ -420,7 +419,7 @@ func (jc *MPIJobReconciler) updateMPIJobStatus(mpiJob *kubeflowv1.MPIJob, launch now := metav1.Now() mpiJob.Status.CompletionTime = &now } - err := updateMPIJobConditions(mpiJob, commonv1.JobSucceeded, mpiJobSucceededReason, msg) + err := 
updateMPIJobConditions(mpiJob, kubeflowv1.JobSucceeded, mpiJobSucceededReason, msg) if err != nil { return err } @@ -438,7 +437,7 @@ func (jc *MPIJobReconciler) updateMPIJobStatus(mpiJob *kubeflowv1.MPIJob, launch now := metav1.Now() mpiJob.Status.CompletionTime = &now } - err := updateMPIJobConditions(mpiJob, commonv1.JobFailed, reason, msg) + err := updateMPIJobConditions(mpiJob, kubeflowv1.JobFailed, reason, msg) if err != nil { klog.Errorf("Append mpiJob(%s/%s) condition error: %v", mpiJob.Namespace, mpiJob.Name, err) return err @@ -471,7 +470,7 @@ func (jc *MPIJobReconciler) updateMPIJobStatus(mpiJob *kubeflowv1.MPIJob, launch } if evict > 0 { msg := fmt.Sprintf("%d/%d workers are evicted", evict, len(worker)) - if err := updateMPIJobConditions(mpiJob, commonv1.JobFailed, mpiJobEvict, msg); err != nil { + if err := updateMPIJobConditions(mpiJob, kubeflowv1.JobFailed, mpiJobEvict, msg); err != nil { return err } jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, mpiJobEvict, msg) @@ -479,7 +478,7 @@ func (jc *MPIJobReconciler) updateMPIJobStatus(mpiJob *kubeflowv1.MPIJob, launch if launcher != nil && launcher.Status.Phase == corev1.PodRunning && running == len(worker) { msg := fmt.Sprintf("MPIJob %s/%s is running.", mpiJob.Namespace, mpiJob.Name) - err := updateMPIJobConditions(mpiJob, commonv1.JobRunning, mpiJobRunningReason, msg) + err := updateMPIJobConditions(mpiJob, kubeflowv1.JobRunning, mpiJobRunningReason, msg) if err != nil { return err } @@ -558,7 +557,7 @@ func (jc *MPIJobReconciler) GetServicesForJob(jobObject interface{}) ([]*corev1. return nil, nil } -func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error { +func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error { mpiJob, ok := job.(*kubeflowv1.MPIJob) if !ok { return fmt.Errorf("%+v is not a type of MPIJob", job) @@ -578,7 +577,7 @@ func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[common if rtype == kubeflowv1.MPIJobReplicaTypeLauncher { if running > 0 { msg := fmt.Sprintf("MPIJob %s is running.", mpiJob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { commonutil.LoggerForJob(mpiJob).Infof("Append job condition error: %v", err) return err @@ -593,7 +592,7 @@ func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[common now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobSucceeded, commonutil.JobSucceededReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, commonutil.JobSucceededReason, msg) if err != nil { commonutil.LoggerForJob(mpiJob).Infof("Append job condition error: %v", err) return err @@ -603,10 +602,10 @@ func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[common } } if failed > 0 { - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { msg := fmt.Sprintf("MPIJob %s is restarting because %d %s replica(s) failed.", mpiJob.Name, failed, rtype) jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, 
commonv1.JobRestarting, commonutil.JobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, commonutil.JobRestartingReason, msg) if err != nil { commonutil.LoggerForJob(mpiJob).Infof("Append job condition error: %v", err) return err @@ -619,7 +618,7 @@ func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[common now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobFailed, commonutil.JobFailedReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, commonutil.JobFailedReason, msg) if err != nil { commonutil.LoggerForJob(mpiJob).Infof("Append job condition error: %v", err) return err @@ -632,9 +631,9 @@ func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[common return nil } -func (jc *MPIJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { +func (jc *MPIJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = map[commonv1.ReplicaType]*commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{} } mpiJob, ok := job.(*kubeflowv1.MPIJob) @@ -868,7 +867,7 @@ func (jc *MPIJobReconciler) getOrCreateWorker(mpiJob *kubeflowv1.MPIJob) ([]*cor } if len(podlist.Items) > int(*workerReplicas) { for _, pod := range podlist.Items { - indexStr, ok := pod.Labels[commonv1.ReplicaIndexLabel] + indexStr, ok := pod.Labels[kubeflowv1.ReplicaIndexLabel] if !ok { return nil, err } @@ -901,7 +900,7 @@ func (jc *MPIJobReconciler) getOrCreateWorker(mpiJob *kubeflowv1.MPIJob) ([]*cor return nil, err } // Insert ReplicaIndexLabel - worker.Labels[commonv1.ReplicaIndexLabel] = strconv.Itoa(int(i)) + worker.Labels[kubeflowv1.ReplicaIndexLabel] = strconv.Itoa(int(i)) pod, err = jc.KubeClientSet.CoreV1().Pods(mpiJob.Namespace).Create(context.Background(), worker, metav1.CreateOptions{}) if err == nil { jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "SuccessfulCreatePod", "Created worker pod: %v", pod.Name) @@ -1025,7 +1024,7 @@ func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDelive masterRole := jc.IsMasterRole(mpiJob.Spec.MPIReplicaSpecs, kubeflowv1.MPIJobReplicaTypeLauncher, 0) if masterRole { - labels[commonv1.JobRoleLabel] = "master" + labels[kubeflowv1.JobRoleLabel] = "master" } podSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.DeepCopy() // copy the labels and annotations to pod from PodTemplate @@ -1391,8 +1390,8 @@ func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding { } } -func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *commonv1.ReplicaSpec) { - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { +func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *kubeflowv1.ReplicaSpec) { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever } else { podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy) diff --git a/pkg/controller.v1/mxnet/mxjob_controller.go b/pkg/controller.v1/mxnet/mxjob_controller.go index 17be567738..82920d002d 100644 --- a/pkg/controller.v1/mxnet/mxjob_controller.go +++ b/pkg/controller.v1/mxnet/mxjob_controller.go @@ -20,7 +20,6 @@ import ( "reflect" "time" - commonv1 
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common" "github.com/kubeflow/training-operator/pkg/common/util" @@ -156,10 +155,10 @@ func (r *MXJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // Set default priorities to mxnet job r.Scheme.Default(mxjob) - // Convert MX.Spec.MXReplicasSpecs to map[commonv1.ReplicaType]*commonv1.ReplicaSpec - replicas := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{} + // Convert MX.Spec.MXReplicasSpecs to map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec + replicas := map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{} for k, v := range mxjob.Spec.MXReplicaSpecs { - replicas[commonv1.ReplicaType(k)] = v + replicas[k] = v } // Use common to reconcile the job related pod and service @@ -336,7 +335,7 @@ func (r *MXJobReconciler) DeleteJob(job interface{}) error { return nil } -func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error { +func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error { mxjob, ok := job.(*kubeflowv1.MXJob) if !ok { return fmt.Errorf("%v is not a type of MXJob", mxjob) @@ -373,10 +372,10 @@ func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 r.Log.Info(fmt.Sprintf("MXJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d, singleTraining=%t", mxjob.Name, rtype, expected, running, succeeded, failed, singleTraining)) - if rtype == commonv1.ReplicaType(kubeflowv1.MXJobReplicaTypeScheduler) || singleTraining { + if rtype == kubeflowv1.MXJobReplicaTypeScheduler || singleTraining { if running > 0 { msg := fmt.Sprintf("MXJob %s is running.", mxjob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, mxJobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, mxJobRunningReason, msg) if err != nil { logrus.Infof("Append mxjob condition error: %v", err) return err @@ -390,7 +389,7 @@ func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobSucceeded, mxJobSucceededReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, mxJobSucceededReason, msg) if err != nil { logrus.Infof("Append mxjob condition error: %v", err) return err @@ -400,10 +399,10 @@ func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 } } if failed > 0 { - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { msg := fmt.Sprintf("mxjob %s is restarting because %d %s replica(s) failed.", mxjob.Name, failed, rtype) r.Recorder.Event(mxjob, corev1.EventTypeWarning, mxJobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, mxJobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, mxJobRestartingReason, msg) if err != nil { logrus.Infof("Append job condition error: %v", err) return err @@ -416,7 +415,7 @@ func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 now := metav1.Now() jobStatus.CompletionTime = &now } - err := 
commonutil.UpdateJobConditions(jobStatus, commonv1.JobFailed, mxJobFailedReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, mxJobFailedReason, msg) if err != nil { logrus.Infof("Append job condition error: %v", err) return err @@ -430,9 +429,9 @@ func (r *MXJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 } // UpdateJobStatusInApiServer updates the status of the given MXJob. -func (r *MXJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { +func (r *MXJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = map[commonv1.ReplicaType]*commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{} } mxJob, ok := job.(*kubeflowv1.MXJob) @@ -465,8 +464,8 @@ func (r *MXJobReconciler) GetDefaultContainerPortName() string { return kubeflowv1.MXJobDefaultPortName } -func (r *MXJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - rtype commonv1.ReplicaType, index int) bool { +func (r *MXJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, index int) bool { return string(rtype) == string(kubeflowv1.MXJobReplicaTypeServer) } @@ -483,7 +482,7 @@ func (r *MXJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool { msg := fmt.Sprintf("MXJob %s is created.", e.Object.GetName()) logrus.Info(msg) trainingoperatorcommon.CreatedJobsCounterInc(mxJob.Namespace, kubeflowv1.MXJobFrameworkName) - if err := commonutil.UpdateJobConditions(&mxJob.Status, commonv1.JobCreated, "MXJobCreated", msg); err != nil { + if err := commonutil.UpdateJobConditions(&mxJob.Status, kubeflowv1.JobCreated, "MXJobCreated", msg); err != nil { logrus.Error(err, "append job condition error") return false } @@ -491,15 +490,15 @@ func (r *MXJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool { } } -func (r *MXJobReconciler) isSingleWorker(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool { +func (r *MXJobReconciler) isSingleWorker(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool { var workerNum, scheNum, svrNum int32 = 0, 0, 0 for rtype, spec := range replicas { - if rtype == commonv1.ReplicaType(kubeflowv1.MXJobReplicaTypeScheduler) { + if rtype == kubeflowv1.MXJobReplicaTypeScheduler { scheNum += *spec.Replicas - } else if rtype == commonv1.ReplicaType(kubeflowv1.MXJobReplicaTypeServer) { + } else if rtype == kubeflowv1.MXJobReplicaTypeServer { svrNum += *spec.Replicas - } else if rtype == commonv1.ReplicaType(kubeflowv1.MXJobReplicaTypeWorker) { + } else if rtype == kubeflowv1.MXJobReplicaTypeWorker { workerNum += *spec.Replicas } } diff --git a/pkg/controller.v1/mxnet/mxnet.go b/pkg/controller.v1/mxnet/mxnet.go index 6aaf1c843a..4693a7bbf9 100644 --- a/pkg/controller.v1/mxnet/mxnet.go +++ b/pkg/controller.v1/mxnet/mxnet.go @@ -20,7 +20,6 @@ import ( "strconv" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/controller.v1/common" @@ -202,7 +201,7 @@ func genLabelsSpec(mxjob *kubeflowv1.MXJob) (LabelsSpec, error) { return labelsSpec, nil } -func getConfigAddr(mxConfigData *MXConfig, rtype commonv1.ReplicaType, index int) UrlPort { +func getConfigAddr(mxConfigData *MXConfig, rtype 
kubeflowv1.ReplicaType, index int) UrlPort { rt := strings.ToLower(string(rtype)) var urlPort UrlPort if len(mxConfigData.Cluster[rt]) <= index { @@ -217,13 +216,13 @@ func getConfigAddr(mxConfigData *MXConfig, rtype commonv1.ReplicaType, index int return urlPort } -func getConfigReplica(mxConfigData *MXConfig, rtype commonv1.ReplicaType) int { +func getConfigReplica(mxConfigData *MXConfig, rtype kubeflowv1.ReplicaType) int { rt := strings.ToLower(string(rtype)) return len(mxConfigData.Cluster[rt]) } // getPortFromMXJob gets the port of mxnet container. -func getPortFromMXJob(mxJob *kubeflowv1.MXJob, rtype commonv1.ReplicaType) (int32, error) { +func getPortFromMXJob(mxJob *kubeflowv1.MXJob, rtype kubeflowv1.ReplicaType) (int32, error) { containers := mxJob.Spec.MXReplicaSpecs[rtype].Template.Spec.Containers for _, container := range containers { if container.Name == kubeflowv1.MXJobDefaultContainerName { diff --git a/pkg/controller.v1/paddlepaddle/envvar.go b/pkg/controller.v1/paddlepaddle/envvar.go index 853e1da8f7..ff5c29a592 100644 --- a/pkg/controller.v1/paddlepaddle/envvar.go +++ b/pkg/controller.v1/paddlepaddle/envvar.go @@ -19,7 +19,6 @@ import ( "strconv" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" corev1 "k8s.io/api/core/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" @@ -152,12 +151,12 @@ func getTotalReplicas(job *kubeflowv1.PaddleJob) int32 { return jobReplicas } -func replicaName(jobName string, rtype commonv1.ReplicaType, index int) string { +func replicaName(jobName string, rtype kubeflowv1.ReplicaType, index int) string { n := jobName + "-" + strings.ToLower(string(rtype)) + "-" + strconv.Itoa(index) return strings.Replace(n, "/", "-", -1) } -func getPortFromPaddleJob(job *kubeflowv1.PaddleJob, rtype commonv1.ReplicaType) int32 { +func getPortFromPaddleJob(job *kubeflowv1.PaddleJob, rtype kubeflowv1.ReplicaType) int32 { containers := job.Spec.PaddleReplicaSpecs[rtype].Template.Spec.Containers for _, container := range containers { if container.Name == kubeflowv1.PaddleJobDefaultContainerName { diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go index ebd7654342..c26f8a71c8 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go @@ -20,7 +20,6 @@ import ( "strings" "time" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common" "github.com/kubeflow/training-operator/pkg/common/util" @@ -334,9 +333,9 @@ func (r *PaddleJobReconciler) DeleteJob(job interface{}) error { } func (jc *PaddleJobReconciler) GenLabelSelector(jobName string, - rtype commonv1.ReplicaType) *metav1.LabelSelector { + rtype kubeflowv1.ReplicaType) *metav1.LabelSelector { labels := jc.GenLabels(jobName) - labels[commonv1.ReplicaTypeLabel] = strings.ToLower(string(rtype)) + labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(rtype)) return &metav1.LabelSelector{ MatchLabels: labels, @@ -345,8 +344,8 @@ func (jc *PaddleJobReconciler) GenLabelSelector(jobName string, // UpdateJobStatus updates the job status and job conditions func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - jobStatus *commonv1.JobStatus) error { + replicas 
map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + jobStatus *kubeflowv1.JobStatus) error { paddlejob, ok := job.(*kubeflowv1.PaddleJob) if !ok { return fmt.Errorf("%+v is not a type of PaddleJob", job) @@ -386,10 +385,10 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, paddlejob.Name, rtype, expected, running, succeeded, failed, specReplicas) if ContainsMasterSpec(replicas) { - if rtype == commonv1.ReplicaType(kubeflowv1.PaddleJobReplicaTypeMaster) { + if rtype == kubeflowv1.PaddleJobReplicaTypeMaster { if running > 0 { msg := fmt.Sprintf("PaddleJob %s is running.", paddlejob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { commonutil.LoggerForJob(paddlejob).Infof("Append job condition error: %v", err) return err @@ -404,7 +403,7 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobSucceeded, commonutil.JobSucceededReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, commonutil.JobSucceededReason, msg) if err != nil { commonutil.LoggerForJob(paddlejob).Infof("Append job condition error: %v", err) return err @@ -425,7 +424,7 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, jobStatus.CompletionTime = &now } err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobSucceeded, commonutil.JobSucceededReason, msg) + kubeflowv1.JobSucceeded, commonutil.JobSucceededReason, msg) if err != nil { commonutil.LoggerForJob(paddlejob).Infof("Append paddlejob condition error: %v", err) return err @@ -435,7 +434,7 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, // Some workers are still running, leave a running condition. 
msg := fmt.Sprintf("PaddleJob %s/%s is running.", paddlejob.Namespace, paddlejob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { commonutil.LoggerForJob(paddlejob).Infof("Append paddlejob condition error: %v", err) return err @@ -445,10 +444,10 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, } if failed > 0 && (specReplicas > succeeded+running) { - if spec.RestartPolicy != commonv1.RestartPolicyNever { + if spec.RestartPolicy != kubeflowv1.RestartPolicyNever { msg := fmt.Sprintf("PaddleJob %s is restarting because %d %s replica(s) failed.", paddlejob.Name, failed, rtype) r.Recorder.Event(paddlejob, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, commonutil.JobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, commonutil.JobRestartingReason, msg) if err != nil { commonutil.LoggerForJob(paddlejob).Infof("Append job condition error: %v", err) return err @@ -461,7 +460,7 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobFailed, commonutil.JobFailedReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, commonutil.JobFailedReason, msg) if err != nil { commonutil.LoggerForJob(paddlejob).Infof("Append job condition error: %v", err) return err @@ -475,7 +474,7 @@ func (r *PaddleJobReconciler) UpdateJobStatus(job interface{}, } // ContainsMasterSpec returns true if the paddlejob contains master spec. -func ContainsMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool { +func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool { if _, ok := replicas[kubeflowv1.PaddleJobReplicaTypeMaster]; ok { return true } @@ -483,9 +482,9 @@ func ContainsMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) } // UpdateJobStatusInApiServer updates the job status in to cluster. 
-func (r *PaddleJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { +func (r *PaddleJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = map[commonv1.ReplicaType]*commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{} } paddlejob, ok := job.(*kubeflowv1.PaddleJob) @@ -530,8 +529,8 @@ func (r *PaddleJobReconciler) GetDefaultContainerPortName() string { return kubeflowv1.PaddleJobDefaultPortName } -func (r *PaddleJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - rtype commonv1.ReplicaType, index int) bool { +func (r *PaddleJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, index int) bool { return string(rtype) == string(kubeflowv1.PaddleJobReplicaTypeMaster) } @@ -546,7 +545,7 @@ func (r *PaddleJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool { msg := fmt.Sprintf("PaddleJob %s is created.", e.Object.GetName()) logrus.Info(msg) trainingoperatorcommon.CreatedJobsCounterInc(paddlejob.Namespace, kubeflowv1.PaddleJobFrameworkName) - if err := commonutil.UpdateJobConditions(&paddlejob.Status, commonv1.JobCreated, "PaddleJobCreated", msg); err != nil { + if err := commonutil.UpdateJobConditions(&paddlejob.Status, kubeflowv1.JobCreated, "PaddleJobCreated", msg); err != nil { logrus.Error(err, "append job condition error") return false } diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go index 77b65406a7..3cd82d3424 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go @@ -25,7 +25,6 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/utils/pointer" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -45,7 +44,7 @@ var _ = Describe("PaddleJob controller", func() { By("By creating a new PaddleJob") ctx := context.Background() job := newPaddleJobForTest(name, namespace) - job.Spec.PaddleReplicaSpecs = map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + job.Spec.PaddleReplicaSpecs = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PaddleJobReplicaTypeMaster: { Replicas: pointer.Int32(1), Template: corev1.PodTemplateSpec{ @@ -156,7 +155,7 @@ var _ = Describe("PaddleJob controller", func() { ReplicaStatuses[kubeflowv1.PaddleJobReplicaTypeMaster].Succeeded == 1 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) // Check if the job is succeeded. - cond := getCondition(created.Status, commonv1.JobSucceeded) + cond := getCondition(created.Status, kubeflowv1.JobSucceeded) Expect(cond.Status).To(Equal(corev1.ConditionTrue)) By("Deleting the PaddleJob") Expect(testK8sClient.Delete(ctx, job)).Should(Succeed()) @@ -174,7 +173,7 @@ func newPaddleJobForTest(name, namespace string) *kubeflowv1.PaddleJob { } // getCondition returns the condition with the provided type. 
-func getCondition(status commonv1.JobStatus, condType commonv1.JobConditionType) *commonv1.JobCondition { +func getCondition(status kubeflowv1.JobStatus, condType kubeflowv1.JobConditionType) *kubeflowv1.JobCondition { for _, condition := range status.Conditions { if condition.Type == condType { return &condition diff --git a/pkg/controller.v1/pytorch/elastic_test.go b/pkg/controller.v1/pytorch/elastic_test.go index 295b0bfbb9..fdec37f869 100644 --- a/pkg/controller.v1/pytorch/elastic_test.go +++ b/pkg/controller.v1/pytorch/elastic_test.go @@ -17,7 +17,6 @@ package pytorch import ( "testing" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -42,7 +41,7 @@ func TestElasticGenerate(t *testing.T) { name: "Without ElasticPolicy", job: &kubeflowv1.PyTorchJob{ Spec: kubeflowv1.PyTorchJobSpec{ - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchReplicaSpecs: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeWorker: { Replicas: pointer.Int32(1), }, @@ -76,7 +75,7 @@ func TestElasticGenerate(t *testing.T) { NProcPerNode: pointer.Int32(1), MaxRestarts: pointer.Int32(3), }, - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchReplicaSpecs: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeWorker: { Replicas: pointer.Int32(1), }, diff --git a/pkg/controller.v1/pytorch/envvar.go b/pkg/controller.v1/pytorch/envvar.go index 1497444ec1..d3ff5880a7 100644 --- a/pkg/controller.v1/pytorch/envvar.go +++ b/pkg/controller.v1/pytorch/envvar.go @@ -19,7 +19,6 @@ import ( "strconv" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" corev1 "k8s.io/api/core/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" @@ -103,12 +102,12 @@ func getTotalReplicas(job *kubeflowv1.PyTorchJob) int32 { return jobReplicas } -func replicaName(jobName string, rtype commonv1.ReplicaType, index int) string { +func replicaName(jobName string, rtype kubeflowv1.ReplicaType, index int) string { n := jobName + "-" + strings.ToLower(string(rtype)) + "-" + strconv.Itoa(index) return strings.Replace(n, "/", "-", -1) } -func getPortFromPyTorchJob(job *kubeflowv1.PyTorchJob, rtype commonv1.ReplicaType) (int32, error) { +func getPortFromPyTorchJob(job *kubeflowv1.PyTorchJob, rtype kubeflowv1.ReplicaType) (int32, error) { containers := job.Spec.PyTorchReplicaSpecs[rtype].Template.Spec.Containers for _, container := range containers { if container.Name == kubeflowv1.PytorchJobDefaultContainerName { diff --git a/pkg/controller.v1/pytorch/initcontainer_test.go b/pkg/controller.v1/pytorch/initcontainer_test.go index fe4b9884b6..0a67c69d9f 100644 --- a/pkg/controller.v1/pytorch/initcontainer_test.go +++ b/pkg/controller.v1/pytorch/initcontainer_test.go @@ -19,7 +19,6 @@ import ( "testing" "github.com/go-logr/logr" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" "k8s.io/utils/pointer" @@ -38,7 +37,7 @@ func TestInitContainer(t *testing.T) { testCases := []struct { job *kubeflowv1.PyTorchJob - rtype commonv1.ReplicaType + rtype kubeflowv1.ReplicaType index string expected int exepctedErr error @@ -46,7 +45,7 @@ func TestInitContainer(t *testing.T) { { job: &kubeflowv1.PyTorchJob{ Spec: kubeflowv1.PyTorchJobSpec{ - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + 
PyTorchReplicaSpecs: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeWorker: { Replicas: pointer.Int32(1), }, @@ -61,7 +60,7 @@ func TestInitContainer(t *testing.T) { { job: &kubeflowv1.PyTorchJob{ Spec: kubeflowv1.PyTorchJobSpec{ - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchReplicaSpecs: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeWorker: { Replicas: pointer.Int32(1), }, @@ -79,7 +78,7 @@ func TestInitContainer(t *testing.T) { { job: &kubeflowv1.PyTorchJob{ Spec: kubeflowv1.PyTorchJobSpec{ - PyTorchReplicaSpecs: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + PyTorchReplicaSpecs: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeWorker: { Replicas: pointer.Int32(1), }, diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index 46e8fd1502..8cb378fb15 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -20,7 +20,6 @@ import ( "strings" "time" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common" "github.com/kubeflow/training-operator/pkg/common/util" @@ -334,9 +333,9 @@ func (r *PyTorchJobReconciler) DeleteJob(job interface{}) error { } func (jc *PyTorchJobReconciler) GenLabelSelector(jobName string, - rtype commonv1.ReplicaType) *metav1.LabelSelector { + rtype kubeflowv1.ReplicaType) *metav1.LabelSelector { labels := jc.GenLabels(jobName) - labels[commonv1.ReplicaTypeLabel] = strings.ToLower(string(rtype)) + labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(rtype)) return &metav1.LabelSelector{ MatchLabels: labels, @@ -345,8 +344,8 @@ func (jc *PyTorchJobReconciler) GenLabelSelector(jobName string, // UpdateJobStatus updates the job status and job conditions func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - jobStatus *commonv1.JobStatus) error { + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + jobStatus *kubeflowv1.JobStatus) error { pytorchjob, ok := job.(*kubeflowv1.PyTorchJob) if !ok { return fmt.Errorf("%+v is not a type of PyTorchJob", job) @@ -385,10 +384,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, pytorchjob.Name, rtype, expected, running, succeeded, failed, specReplicas) if ContainsMasterSpec(replicas) { - if rtype == commonv1.ReplicaType(kubeflowv1.PyTorchJobReplicaTypeMaster) { + if rtype == kubeflowv1.PyTorchJobReplicaTypeMaster { if running > 0 { msg := fmt.Sprintf("PyTorchJob %s is running.", pytorchjob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { commonutil.LoggerForJob(pytorchjob).Infof("Append job condition error: %v", err) return err @@ -403,7 +402,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobSucceeded, commonutil.JobSucceededReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, commonutil.JobSucceededReason, msg) if err != nil { 
commonutil.LoggerForJob(pytorchjob).Infof("Append job condition error: %v", err) return err @@ -427,7 +426,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, jobStatus.CompletionTime = &now } err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobSucceeded, commonutil.JobSucceededReason, msg) + kubeflowv1.JobSucceeded, commonutil.JobSucceededReason, msg) if err != nil { commonutil.LoggerForJob(pytorchjob).Infof("Append pytorchjob condition error: %v", err) return err @@ -437,7 +436,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, // Some workers are still running, leave a running condition. msg := fmt.Sprintf("PyTorchJob %s/%s is running.", pytorchjob.Namespace, pytorchjob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { commonutil.LoggerForJob(pytorchjob).Infof("Append pytorchjob condition error: %v", err) return err @@ -447,10 +446,10 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, } if failed > 0 && (specReplicas > succeeded+running) { - if spec.RestartPolicy != commonv1.RestartPolicyNever { + if spec.RestartPolicy != kubeflowv1.RestartPolicyNever { msg := fmt.Sprintf("PyTorchJob %s is restarting because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype) r.Recorder.Event(pytorchjob, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, commonutil.JobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, commonutil.JobRestartingReason, msg) if err != nil { commonutil.LoggerForJob(pytorchjob).Infof("Append job condition error: %v", err) return err @@ -463,7 +462,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobFailed, commonutil.JobFailedReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, commonutil.JobFailedReason, msg) if err != nil { commonutil.LoggerForJob(pytorchjob).Infof("Append job condition error: %v", err) return err @@ -476,7 +475,7 @@ func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{}, } // ContainsMasterSpec returns true if the pytorchjob contains master spec. -func ContainsMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool { +func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool { if _, ok := replicas[kubeflowv1.PyTorchJobReplicaTypeMaster]; ok { return true } @@ -484,9 +483,9 @@ func ContainsMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) } // UpdateJobStatusInApiServer updates the job status in to cluster. 
-func (r *PyTorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { +func (r *PyTorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = map[commonv1.ReplicaType]*commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{} } pytorchjob, ok := job.(*kubeflowv1.PyTorchJob) @@ -533,8 +532,8 @@ func (r *PyTorchJobReconciler) GetDefaultContainerPortName() string { return kubeflowv1.PytorchJobDefaultPortName } -func (r *PyTorchJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - rtype commonv1.ReplicaType, index int) bool { +func (r *PyTorchJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, index int) bool { return string(rtype) == string(kubeflowv1.PyTorchJobReplicaTypeMaster) } @@ -549,7 +548,7 @@ func (r *PyTorchJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool msg := fmt.Sprintf("PyTorchJob %s is created.", e.Object.GetName()) logrus.Info(msg) trainingoperatorcommon.CreatedJobsCounterInc(pytorchjob.Namespace, kubeflowv1.PytorchJobFrameworkName) - if err := commonutil.UpdateJobConditions(&pytorchjob.Status, commonv1.JobCreated, "PyTorchJobCreated", msg); err != nil { + if err := commonutil.UpdateJobConditions(&pytorchjob.Status, kubeflowv1.JobCreated, "PyTorchJobCreated", msg); err != nil { logrus.Error(err, "append job condition error") return false } diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go index 39ab652c52..81b2a29556 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go @@ -25,7 +25,6 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/utils/pointer" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -45,7 +44,7 @@ var _ = Describe("PyTorchJob controller", func() { By("By creating a new PyTorchJob") ctx := context.Background() job := newPyTorchJobForTest(name, namespace) - job.Spec.PyTorchReplicaSpecs = map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + job.Spec.PyTorchReplicaSpecs = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeMaster: { Replicas: pointer.Int32(1), Template: corev1.PodTemplateSpec{ @@ -159,7 +158,7 @@ var _ = Describe("PyTorchJob controller", func() { ReplicaStatuses[kubeflowv1.PyTorchJobReplicaTypeMaster].Succeeded == 1 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) // Check if the job is succeeded. 
- cond := getCondition(created.Status, commonv1.JobSucceeded) + cond := getCondition(created.Status, kubeflowv1.JobSucceeded) Expect(cond.Status).To(Equal(corev1.ConditionTrue)) By("Deleting the PyTorchJob") Expect(testK8sClient.Delete(ctx, job)).Should(Succeed()) @@ -188,7 +187,7 @@ var _ = Describe("PyTorchJob controller", func() { MinReplicas: minReplicas, MaxRestarts: maxRestarts, } - job.Spec.PyTorchReplicaSpecs = map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + job.Spec.PyTorchReplicaSpecs = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.PyTorchJobReplicaTypeWorker: { Replicas: pointer.Int32(1), Template: corev1.PodTemplateSpec{ @@ -287,7 +286,7 @@ var _ = Describe("PyTorchJob controller", func() { ReplicaStatuses[kubeflowv1.PyTorchJobReplicaTypeWorker].Succeeded == 1 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) // Check if the job is succeeded. - cond := getCondition(created.Status, commonv1.JobSucceeded) + cond := getCondition(created.Status, kubeflowv1.JobSucceeded) Expect(cond.Status).To(Equal(corev1.ConditionTrue)) By("Deleting the PyTorchJob") Expect(testK8sClient.Delete(ctx, job)).Should(Succeed()) @@ -305,7 +304,7 @@ func newPyTorchJobForTest(name, namespace string) *kubeflowv1.PyTorchJob { } // getCondition returns the condition with the provided type. -func getCondition(status commonv1.JobStatus, condType commonv1.JobConditionType) *commonv1.JobCondition { +func getCondition(status kubeflowv1.JobStatus, condType kubeflowv1.JobConditionType) *kubeflowv1.JobCondition { for _, condition := range status.Conditions { if condition.Type == condType { return &condition diff --git a/pkg/controller.v1/tensorflow/job_test.go b/pkg/controller.v1/tensorflow/job_test.go index f377c325dc..99e8adc16d 100644 --- a/pkg/controller.v1/tensorflow/job_test.go +++ b/pkg/controller.v1/tensorflow/job_test.go @@ -21,9 +21,6 @@ import ( "time" "github.com/google/go-cmp/cmp/cmpopts" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" - commonutil "github.com/kubeflow/training-operator/pkg/util" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -36,7 +33,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil" + commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -163,7 +162,7 @@ var _ = Describe("TFJob controller", func() { testCases := []testCase{ { description: "4 workers and 2 ps is running, policy is all", - tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, commonv1.CleanPodPolicyAll), + tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyAll), pendingWorkerPods: 0, activeWorkerPods: 4, @@ -182,7 +181,7 @@ var _ = Describe("TFJob controller", func() { }, { description: "4 workers and 2 ps is running, policy is running", - tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, commonv1.CleanPodPolicyRunning), + tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyRunning), pendingWorkerPods: 0, activeWorkerPods: 4, @@ -201,7 +200,7 @@ var _ = Describe("TFJob controller", func() { }, { description: "4 workers and 2 ps is succeeded, policy is running", - tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, commonv1.CleanPodPolicyRunning), + tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyRunning), pendingWorkerPods: 0, activeWorkerPods: 0, @@ -220,7 +219,7 @@ var _ = Describe("TFJob controller", func() { }, { description: "4 workers and 2 ps is succeeded, policy is None", - tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, commonv1.CleanPodPolicyNone), + tfJob: tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyNone), pendingWorkerPods: 0, activeWorkerPods: 0, @@ -245,7 +244,7 @@ var _ = Describe("TFJob controller", func() { ctx := context.Background() tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx)) tc.tfJob.SetUID(uuid.NewUUID()) - Expect(commonutil.UpdateJobConditions(&tc.tfJob.Status, commonv1.JobSucceeded, tfJobSucceededReason, "")).Should(Succeed()) + Expect(commonutil.UpdateJobConditions(&tc.tfJob.Status, kubeflowv1.JobSucceeded, tfJobSucceededReason, "")).Should(Succeed()) refs := []metav1.OwnerReference{ *reconciler.GenOwnerReference(tc.tfJob), @@ -577,7 +576,7 @@ var _ = Describe("TFJob controller", func() { Expect(reconciler.Status().Update(ctx, &updatedTFJob)).To(Succeed()) By("waiting for updating replicaStatus for workers") - Eventually(func() *commonv1.ReplicaStatus { + Eventually(func() *kubeflowv1.ReplicaStatus { var getTFJob kubeflowv1.TFJob Expect(reconciler.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &getTFJob)).Should(Succeed()) return getTFJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker] diff --git a/pkg/controller.v1/tensorflow/pod_test.go b/pkg/controller.v1/tensorflow/pod_test.go index 860ffc8226..188e0fcab7 100644 --- a/pkg/controller.v1/tensorflow/pod_test.go +++ b/pkg/controller.v1/tensorflow/pod_test.go @@ -19,7 +19,6 @@ import ( "fmt" "os" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -106,8 +105,8 @@ var _ = Describe("TFJob controller", func() { jobName := c.tfJob.GetName() labels := reconciler.GenLabels(jobName) - labels[commonv1.ReplicaTypeLabel] = c.rt - labels[commonv1.ReplicaIndexLabel] = c.index + labels[kubeflowv1.ReplicaTypeLabel] = c.rt + labels[kubeflowv1.ReplicaIndexLabel] = c.index Expect(reconciler.SetClusterSpec(c.tfJob, podTemplate, c.rt, c.index)).Should(Succeed()) @@ -157,12 +156,12 @@ var _ = Describe("TFJob controller", func() { type tc struct { tfJob *kubeflowv1.TFJob expectedRestartPolicy corev1.RestartPolicy - expectedType commonv1.ReplicaType + expectedType kubeflowv1.ReplicaType } testCase := []tc{ func() tc { tfJob := tftestutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyExitCode + specRestartPolicy := kubeflowv1.RestartPolicyExitCode tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].RestartPolicy = specRestartPolicy return tc{ tfJob: tfJob, @@ -172,7 +171,7 @@ var _ = Describe("TFJob controller", func() { }(), func() tc { tfJob := tftestutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyNever + specRestartPolicy := kubeflowv1.RestartPolicyNever tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].RestartPolicy = specRestartPolicy return tc{ tfJob: tfJob, @@ -182,7 +181,7 @@ var _ = Describe("TFJob controller", func() { }(), func() tc { tfJob := tftestutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyAlways + specRestartPolicy := kubeflowv1.RestartPolicyAlways tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].RestartPolicy = specRestartPolicy return tc{ tfJob: tfJob, @@ -192,7 +191,7 @@ var _ = Describe("TFJob controller", func() { }(), func() tc { tfJob := tftestutil.NewTFJob(1, 0) - specRestartPolicy := commonv1.RestartPolicyOnFailure + specRestartPolicy := kubeflowv1.RestartPolicyOnFailure tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].RestartPolicy = specRestartPolicy return tc{ tfJob: tfJob, @@ -218,7 +217,7 @@ var _ = Describe("TFJob controller", func() { tfJob := tftestutil.NewTFJob(1, 0) tfJob.SetName("test-exit-code") tfJob.SetUID(uuid.NewUUID()) - tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].RestartPolicy = commonv1.RestartPolicyExitCode + tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].RestartPolicy = kubeflowv1.RestartPolicyExitCode refs := []metav1.OwnerReference{ *reconciler.GenOwnerReference(tfJob), @@ -420,7 +419,7 @@ var _ = Describe("TFJob controller", func() { // worker failed, succeeded, running num workers [3]int32 tfJob *kubeflowv1.TFJob - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec expected bool expectedErr bool }{ @@ -429,7 +428,7 @@ var _ = Describe("TFJob controller", func() { tfJob: tftestutil.NewTFJobV2(1, 1, 0, 0, 0), expected: false, expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + replicas: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.TFJobReplicaTypeWorker: { Replicas: newInt32(1), Template: tftestutil.NewTFReplicaSpecTemplate(), @@ -445,7 +444,7 @@ var _ = Describe("TFJob controller", func() { tfJob: tftestutil.NewTFJobV2(1, 0, 0, 0, 0), expected: true, expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + replicas: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.TFJobReplicaTypeWorker: { Replicas: newInt32(1), Template: tftestutil.NewTFReplicaSpecTemplate(), @@ -457,7 +456,7 
@@ var _ = Describe("TFJob controller", func() { tfJob: tftestutil.NewTFJobV2(0, 0, 1, 0, 0), expected: true, expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + replicas: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.TFJobReplicaTypeMaster: { Replicas: newInt32(1), Template: tftestutil.NewTFReplicaSpecTemplate(), @@ -469,7 +468,7 @@ var _ = Describe("TFJob controller", func() { tfJob: tftestutil.NewTFJobV2(0, 0, 0, 1, 0), expected: true, expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + replicas: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.TFJobReplicaTypeChief: { Replicas: newInt32(1), Template: tftestutil.NewTFReplicaSpecTemplate(), @@ -481,7 +480,7 @@ var _ = Describe("TFJob controller", func() { tfJob: tftestutil.NewTFJobV2(2, 0, 0, 0, 0), expected: true, expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + replicas: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.TFJobReplicaTypeWorker: { Replicas: newInt32(2), Template: tftestutil.NewTFReplicaSpecTemplate(), @@ -493,7 +492,7 @@ var _ = Describe("TFJob controller", func() { tfJob: tftestutil.NewTFJobV2(2, 0, 0, 0, 0), expected: false, expectedErr: false, - replicas: map[commonv1.ReplicaType]*commonv1.ReplicaSpec{ + replicas: map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{ kubeflowv1.TFJobReplicaTypeWorker: { Replicas: newInt32(2), Template: tftestutil.NewTFReplicaSpecTemplate(), diff --git a/pkg/controller.v1/tensorflow/status_test.go b/pkg/controller.v1/tensorflow/status_test.go index d2cf9aaa80..aa17143b90 100644 --- a/pkg/controller.v1/tensorflow/status_test.go +++ b/pkg/controller.v1/tensorflow/status_test.go @@ -18,7 +18,6 @@ import ( "context" "fmt" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -58,7 +57,7 @@ var _ = Describe("TFJob controller", func() { By("finding failed job status") found := false for _, condition := range tfJob.Status.Conditions { - if condition.Type == commonv1.JobFailed { + if condition.Type == kubeflowv1.JobFailed { found = true } } @@ -87,7 +86,7 @@ var _ = Describe("TFJob controller", func() { restart bool worker0Completed bool - expectedType commonv1.JobConditionType + expectedType kubeflowv1.JobConditionType } testCases := []testCase{ @@ -105,7 +104,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobSucceeded, + expectedType: kubeflowv1.JobSucceeded, }, { description: "Chief worker is running", @@ -121,7 +120,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 1, restart: false, worker0Completed: false, - expectedType: commonv1.JobRunning, + expectedType: kubeflowv1.JobRunning, }, { description: "Chief worker is failed", @@ -137,7 +136,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "(No chief worker) Worker is failed", @@ -153,7 +152,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "(No chief worker) Worker is succeeded", @@ -169,7 +168,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobSucceeded, + expectedType: kubeflowv1.JobSucceeded, }, { description: "(No chief worker) Worker is running", @@ -185,7 +184,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobRunning, + expectedType: kubeflowv1.JobRunning, }, { description: "(No chief worker) 2 workers are succeeded, 2 workers are active", @@ -201,7 +200,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobRunning, + expectedType: kubeflowv1.JobRunning, }, { description: "(No chief worker) 2 workers are running, 2 workers are failed", @@ -217,7 +216,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "(No chief worker) 2 workers are succeeded, 2 workers are failed", @@ -233,7 +232,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "(No chief worker) worker-0 are succeeded, 3 workers are active", @@ -249,7 +248,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: true, - expectedType: commonv1.JobSucceeded, + expectedType: kubeflowv1.JobSucceeded, }, { description: "(No chief worker, successPolicy: AllWorkers) worker-0 are succeeded, 3 workers are active", @@ -265,7 +264,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: true, - expectedType: commonv1.JobRunning, + expectedType: kubeflowv1.JobRunning, }, { description: "(No chief 
worker, successPolicy: AllWorkers) 4 workers are succeeded", @@ -281,7 +280,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: true, - expectedType: commonv1.JobSucceeded, + expectedType: kubeflowv1.JobSucceeded, }, { description: "(No chief worker, successPolicy: AllWorkers) worker-0 is succeeded, 2 workers are running, 1 worker is failed", @@ -297,7 +296,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: true, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "Chief is running, workers are failed", @@ -313,7 +312,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 1, restart: false, worker0Completed: false, - expectedType: commonv1.JobRunning, + expectedType: kubeflowv1.JobRunning, }, { description: "Chief is running, workers are succeeded", @@ -329,7 +328,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 1, restart: false, worker0Completed: false, - expectedType: commonv1.JobRunning, + expectedType: kubeflowv1.JobRunning, }, { description: "Chief is running, a PS is failed", @@ -345,7 +344,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 1, restart: false, worker0Completed: false, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "Chief is failed, workers are succeeded", @@ -361,7 +360,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobFailed, + expectedType: kubeflowv1.JobFailed, }, { description: "Chief is succeeded, workers are failed", @@ -377,7 +376,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: false, worker0Completed: false, - expectedType: commonv1.JobSucceeded, + expectedType: kubeflowv1.JobSucceeded, }, { description: "Chief is failed and restarting", @@ -393,7 +392,7 @@ var _ = Describe("TFJob controller", func() { expectedActiveChief: 0, restart: true, worker0Completed: false, - expectedType: commonv1.JobRestarting, + expectedType: kubeflowv1.JobRestarting, }, } @@ -457,9 +456,9 @@ var _ = Describe("TFJob controller", func() { }) }) -func setStatusForTest(tfJob *kubeflowv1.TFJob, rtype commonv1.ReplicaType, failed, succeeded, active int32, restart bool, worker0Completed bool, client client.Client) { +func setStatusForTest(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType, failed, succeeded, active int32, restart bool, worker0Completed bool, client client.Client) { if restart == true { - tfJob.Spec.TFReplicaSpecs[rtype].RestartPolicy = commonv1.RestartPolicyExitCode + tfJob.Spec.TFReplicaSpecs[rtype].RestartPolicy = kubeflowv1.RestartPolicyExitCode } basicLabels := reconciler.GenLabels(tfJob.GetName()) @@ -588,10 +587,10 @@ func genKeyFromJob(job client.Object) types.NamespacedName { } } -func filterOutConditionTest(status commonv1.JobStatus) error { +func filterOutConditionTest(status kubeflowv1.JobStatus) error { flag := util.IsFailed(status) || util.IsSucceeded(status) for _, condition := range status.Conditions { - if flag && condition.Type == commonv1.JobRunning && condition.Status == corev1.ConditionTrue { + if flag && condition.Type == kubeflowv1.JobRunning && condition.Status == corev1.ConditionTrue { return fmt.Errorf("error condition status when succeeded or failed") } } diff --git a/pkg/controller.v1/tensorflow/testutil/pod.go b/pkg/controller.v1/tensorflow/testutil/pod.go 
index 99ca3b9817..da6562d76e 100644 --- a/pkg/controller.v1/tensorflow/testutil/pod.go +++ b/pkg/controller.v1/tensorflow/testutil/pod.go @@ -19,13 +19,13 @@ import ( "fmt" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -54,15 +54,15 @@ func NewBasePod(name string, job metav1.Object, refs []metav1.OwnerReference) *c } } -func NewPod(job metav1.Object, typ commonv1.ReplicaType, index int, refs []metav1.OwnerReference) *corev1.Pod { +func NewPod(job metav1.Object, typ kubeflowv1.ReplicaType, index int, refs []metav1.OwnerReference) *corev1.Pod { pod := NewBasePod(fmt.Sprintf("%s-%s-%d", job.GetName(), strings.ToLower(string(typ)), index), job, refs) - pod.Labels[commonv1.ReplicaTypeLabel] = strings.ToLower(string(typ)) - pod.Labels[commonv1.ReplicaIndexLabel] = fmt.Sprintf("%d", index) + pod.Labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(typ)) + pod.Labels[kubeflowv1.ReplicaIndexLabel] = fmt.Sprintf("%d", index) return pod } // NewPodList create count pods with the given phase for the given tfJob -func NewPodList(count int32, status corev1.PodPhase, job metav1.Object, typ commonv1.ReplicaType, start int32, refs []metav1.OwnerReference) []*corev1.Pod { +func NewPodList(count int32, status corev1.PodPhase, job metav1.Object, typ kubeflowv1.ReplicaType, start int32, refs []metav1.OwnerReference) []*corev1.Pod { pods := []*corev1.Pod{} for i := int32(0); i < count; i++ { newPod := NewPod(job, typ, int(start+i), refs) @@ -72,7 +72,7 @@ func NewPodList(count int32, status corev1.PodPhase, job metav1.Object, typ comm return pods } -func SetPodsStatuses(client client.Client, job metav1.Object, typ commonv1.ReplicaType, +func SetPodsStatuses(client client.Client, job metav1.Object, typ kubeflowv1.ReplicaType, pendingPods, activePods, succeededPods, failedPods int32, restartCounts []int32, refs []metav1.OwnerReference, basicLabels map[string]string) { var index int32 diff --git a/pkg/controller.v1/tensorflow/testutil/service.go b/pkg/controller.v1/tensorflow/testutil/service.go index 3bb0e5320b..00e01c628a 100644 --- a/pkg/controller.v1/tensorflow/testutil/service.go +++ b/pkg/controller.v1/tensorflow/testutil/service.go @@ -19,12 +19,13 @@ import ( "fmt" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" . 
"github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) const ( @@ -51,15 +52,15 @@ func NewBaseService(name string, job metav1.Object, refs []metav1.OwnerReference } } -func NewService(job metav1.Object, typ commonv1.ReplicaType, index int, refs []metav1.OwnerReference) *corev1.Service { +func NewService(job metav1.Object, typ kubeflowv1.ReplicaType, index int, refs []metav1.OwnerReference) *corev1.Service { svc := NewBaseService(fmt.Sprintf("%s-%s-%d", job.GetName(), strings.ToLower(string(typ)), index), job, refs) - svc.Labels[commonv1.ReplicaTypeLabel] = strings.ToLower(string(typ)) - svc.Labels[commonv1.ReplicaIndexLabel] = fmt.Sprintf("%d", index) + svc.Labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(typ)) + svc.Labels[kubeflowv1.ReplicaIndexLabel] = fmt.Sprintf("%d", index) return svc } // NewServiceList creates count pods with the given phase for the given tfJob -func NewServiceList(count int32, job metav1.Object, typ commonv1.ReplicaType, refs []metav1.OwnerReference) []*corev1.Service { +func NewServiceList(count int32, job metav1.Object, typ kubeflowv1.ReplicaType, refs []metav1.OwnerReference) []*corev1.Service { services := []*corev1.Service{} for i := int32(0); i < count; i++ { newService := NewService(job, typ, int(i), refs) @@ -68,7 +69,7 @@ func NewServiceList(count int32, job metav1.Object, typ commonv1.ReplicaType, re return services } -func SetServices(client client.Client, job metav1.Object, typ commonv1.ReplicaType, activeWorkerServices int32, +func SetServices(client client.Client, job metav1.Object, typ kubeflowv1.ReplicaType, activeWorkerServices int32, refs []metav1.OwnerReference, basicLabels map[string]string) { ctx := context.Background() for _, svc := range NewServiceList(activeWorkerServices, job, typ, refs) { diff --git a/pkg/controller.v1/tensorflow/testutil/tfjob.go b/pkg/controller.v1/tensorflow/testutil/tfjob.go index 407717b9a6..65770fc5c1 100644 --- a/pkg/controller.v1/tensorflow/testutil/tfjob.go +++ b/pkg/controller.v1/tensorflow/testutil/tfjob.go @@ -15,7 +15,6 @@ package testutil import ( - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -24,7 +23,7 @@ import ( const TestTFJobName = "test-tfjob" -func NewTFJobWithCleanPolicy(chief, worker, ps int, policy commonv1.CleanPodPolicy) *kubeflowv1.TFJob { +func NewTFJobWithCleanPolicy(chief, worker, ps int, policy kubeflowv1.CleanPodPolicy) *kubeflowv1.TFJob { if chief == 1 { tfJob := NewTFJobWithChief(worker, ps) tfJob.Spec.RunPolicy.CleanPodPolicy = &policy @@ -39,13 +38,13 @@ func NewTFJobWithCleanupJobDelay(chief, worker, ps int, ttl *int32) *kubeflowv1. 
if chief == 1 { tfJob := NewTFJobWithChief(worker, ps) tfJob.Spec.RunPolicy.TTLSecondsAfterFinished = ttl - policy := commonv1.CleanPodPolicyNone + policy := kubeflowv1.CleanPodPolicyNone tfJob.Spec.RunPolicy.CleanPodPolicy = &policy return tfJob } tfJob := NewTFJob(worker, ps) tfJob.Spec.RunPolicy.TTLSecondsAfterFinished = ttl - policy := commonv1.CleanPodPolicyNone + policy := kubeflowv1.CleanPodPolicyNone tfJob.Spec.RunPolicy.CleanPodPolicy = &policy return tfJob } @@ -54,13 +53,13 @@ func NewTFJobWithActiveDeadlineSeconds(chief, worker, ps int, ads *int64) *kubef if chief == 1 { tfJob := NewTFJobWithChief(worker, ps) tfJob.Spec.RunPolicy.ActiveDeadlineSeconds = ads - policy := commonv1.CleanPodPolicyAll + policy := kubeflowv1.CleanPodPolicyAll tfJob.Spec.RunPolicy.CleanPodPolicy = &policy return tfJob } tfJob := NewTFJob(worker, ps) tfJob.Spec.RunPolicy.ActiveDeadlineSeconds = ads - policy := commonv1.CleanPodPolicyAll + policy := kubeflowv1.CleanPodPolicyAll tfJob.Spec.RunPolicy.CleanPodPolicy = &policy return tfJob } @@ -70,14 +69,14 @@ func NewTFJobWithBackoffLimit(chief, worker, ps int, backoffLimit *int32) *kubef tfJob := NewTFJobWithChief(worker, ps) tfJob.Spec.RunPolicy.BackoffLimit = backoffLimit tfJob.Spec.TFReplicaSpecs["Worker"].RestartPolicy = "OnFailure" - policy := commonv1.CleanPodPolicyAll + policy := kubeflowv1.CleanPodPolicyAll tfJob.Spec.RunPolicy.CleanPodPolicy = &policy return tfJob } tfJob := NewTFJob(worker, ps) tfJob.Spec.RunPolicy.BackoffLimit = backoffLimit tfJob.Spec.TFReplicaSpecs["Worker"].RestartPolicy = "OnFailure" - policy := commonv1.CleanPodPolicyAll + policy := kubeflowv1.CleanPodPolicyAll tfJob.Spec.RunPolicy.CleanPodPolicy = &policy return tfJob } @@ -85,7 +84,7 @@ func NewTFJobWithBackoffLimit(chief, worker, ps int, backoffLimit *int32) *kubef func NewTFJobWithChief(worker, ps int) *kubeflowv1.TFJob { tfJob := NewTFJob(worker, ps) chief := int32(1) - tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = &commonv1.ReplicaSpec{ + tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeChief] = &kubeflowv1.ReplicaSpec{ Replicas: &chief, Template: NewTFReplicaSpecTemplate(), } @@ -96,7 +95,7 @@ func NewTFJobWithEvaluator(worker, ps, evaluator int) *kubeflowv1.TFJob { tfJob := NewTFJob(worker, ps) if evaluator > 0 { evaluator := int32(evaluator) - tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeEval] = &commonv1.ReplicaSpec{ + tfJob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeEval] = &kubeflowv1.ReplicaSpec{ Replicas: &evaluator, Template: NewTFReplicaSpecTemplate(), } @@ -120,14 +119,14 @@ func NewTFJob(worker, ps int) *kubeflowv1.TFJob { Namespace: metav1.NamespaceDefault, }, Spec: kubeflowv1.TFJobSpec{ - TFReplicaSpecs: make(map[commonv1.ReplicaType]*commonv1.ReplicaSpec), + TFReplicaSpecs: make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec), }, } kubeflowv1.SetObjectDefaults_TFJob(tfJob) if worker > 0 { worker := int32(worker) - workerReplicaSpec := &commonv1.ReplicaSpec{ + workerReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &worker, Template: NewTFReplicaSpecTemplate(), } @@ -136,7 +135,7 @@ func NewTFJob(worker, ps int) *kubeflowv1.TFJob { if ps > 0 { ps := int32(ps) - psReplicaSpec := &commonv1.ReplicaSpec{ + psReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &ps, Template: NewTFReplicaSpecTemplate(), } @@ -155,14 +154,14 @@ func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob { Namespace: metav1.NamespaceDefault, }, Spec: kubeflowv1.TFJobSpec{ - TFReplicaSpecs: 
make(map[commonv1.ReplicaType]*commonv1.ReplicaSpec), + TFReplicaSpecs: make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec), }, } kubeflowv1.SetObjectDefaults_TFJob(tfJob) if worker > 0 { worker := int32(worker) - workerReplicaSpec := &commonv1.ReplicaSpec{ + workerReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &worker, Template: NewTFReplicaSpecTemplate(), } @@ -171,7 +170,7 @@ func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob { if ps > 0 { ps := int32(ps) - psReplicaSpec := &commonv1.ReplicaSpec{ + psReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &ps, Template: NewTFReplicaSpecTemplate(), } @@ -180,7 +179,7 @@ func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob { if master > 0 { master := int32(master) - masterReplicaSpec := &commonv1.ReplicaSpec{ + masterReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &master, Template: NewTFReplicaSpecTemplate(), } @@ -189,7 +188,7 @@ func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob { if chief > 0 { chief := int32(chief) - chiefReplicaSpec := &commonv1.ReplicaSpec{ + chiefReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &chief, Template: NewTFReplicaSpecTemplate(), } @@ -198,7 +197,7 @@ func NewTFJobV2(worker, ps, master, chief, evaluator int) *kubeflowv1.TFJob { if evaluator > 0 { evaluator := int32(evaluator) - evaluatorReplicaSpec := &commonv1.ReplicaSpec{ + evaluatorReplicaSpec := &kubeflowv1.ReplicaSpec{ Replicas: &evaluator, Template: NewTFReplicaSpecTemplate(), } @@ -241,7 +240,7 @@ func NewTFReplicaSpecTemplate() v1.PodTemplateSpec { } } -func CheckCondition(tfJob *kubeflowv1.TFJob, condition commonv1.JobConditionType, reason string) bool { +func CheckCondition(tfJob *kubeflowv1.TFJob, condition kubeflowv1.JobConditionType, reason string) bool { for _, v := range tfJob.Status.Conditions { if v.Type == condition && v.Status == v1.ConditionTrue && v.Reason == reason { return true diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index a556b961da..904347359b 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -21,7 +21,6 @@ import ( "strings" "time" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common" "github.com/kubeflow/training-operator/pkg/common/util" @@ -29,7 +28,7 @@ import ( "github.com/kubeflow/training-operator/pkg/controller.v1/control" "github.com/kubeflow/training-operator/pkg/controller.v1/expectation" commonutil "github.com/kubeflow/training-operator/pkg/util" - train_util "github.com/kubeflow/training-operator/pkg/util/train" + trainutil "github.com/kubeflow/training-operator/pkg/util/train" "github.com/go-logr/logr" "github.com/sirupsen/logrus" @@ -393,7 +392,7 @@ func (r *TFJobReconciler) DeleteJob(job interface{}) error { return nil } -func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error { +func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error { tfJob, ok := job.(*kubeflowv1.TFJob) if !ok { return fmt.Errorf("%v is not a type of TFJob", tfJob) @@ -426,12 +425,12 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas 
map[commonv1 } // For the situation that jobStatus has a restarting condition, and append a running condition, - // the restarting condition will be removed from jobStatus by commonv1.filterOutCondition(), + // the restarting condition will be removed from jobStatus by kubeflowv1.filterOutCondition(), // so we need to record the existing restarting condition for later use. - var existingRestartingCondition *commonv1.JobCondition + var existingRestartingCondition *kubeflowv1.JobCondition for _, condition := range jobStatus.Conditions { - if condition.Type == commonv1.JobRestarting { - existingRestartingCondition = &commonv1.JobCondition{ + if condition.Type == kubeflowv1.JobRestarting { + existingRestartingCondition = &kubeflowv1.JobCondition{ Reason: condition.Reason, Message: condition.Message, } @@ -439,7 +438,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 } // iterate the replica spec based on this order - allTypes := []commonv1.ReplicaType{ + allTypes := []kubeflowv1.ReplicaType{ kubeflowv1.TFJobReplicaTypeChief, kubeflowv1.TFJobReplicaTypeEval, kubeflowv1.TFJobReplicaTypeMaster, @@ -470,7 +469,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 msg := fmt.Sprintf("TFJob %s/%s is running.", tfJob.Namespace, tfJob.Name) err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobRunning, tfJobRunningReason, msg) + kubeflowv1.JobRunning, tfJobRunningReason, msg) if err != nil { commonutil.LoggerForJob(tfJob).Infof( "Append tfjob condition error: %v", err) @@ -486,7 +485,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 jobStatus.CompletionTime = &now } err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobSucceeded, tfJobSucceededReason, msg) + kubeflowv1.JobSucceeded, tfJobSucceededReason, msg) if err != nil { commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) return err @@ -508,7 +507,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 jobStatus.CompletionTime = &now } err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobSucceeded, tfJobSucceededReason, msg) + kubeflowv1.JobSucceeded, tfJobSucceededReason, msg) if err != nil { commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) return err @@ -518,7 +517,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 // Some workers are still running, leave a running condition. msg := fmt.Sprintf("TFJob %s/%s is running.", tfJob.Namespace, tfJob.Name) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, tfJobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, tfJobRunningReason, msg) if err != nil { commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) return err @@ -529,10 +528,10 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 if failed > 0 { // For the situation that jobStatus has a restarting condition, and appends a new running condition, - // the restarting condition will be removed from jobStatus by commonv1.filterOutCondition(), + // the restarting condition will be removed from jobStatus by kubeflowv1.filterOutCondition(), // so we need to append the restarting condition back to jobStatus. 
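For reference, the capture step described in the comment above reduces to the following hedged sketch (hypothetical helper name; it copies the same two fields as the hunk):

package sketch

import (
	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// captureRestarting copies an existing Restarting condition's reason and
// message so they can be re-appended after a Running condition has filtered
// the original entry out, as the comment above explains.
func captureRestarting(jobStatus *kubeflowv1.JobStatus) *kubeflowv1.JobCondition {
	for _, c := range jobStatus.Conditions {
		if c.Type == kubeflowv1.JobRestarting {
			return &kubeflowv1.JobCondition{Reason: c.Reason, Message: c.Message}
		}
	}
	return nil
}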
if existingRestartingCondition != nil { - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, existingRestartingCondition.Reason, existingRestartingCondition.Message) if err != nil { commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) return err @@ -554,7 +553,7 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 jobStatus.CompletionTime = &now } err := commonutil.UpdateJobConditions(jobStatus, - commonv1.JobFailed, tfJobFailedReason, msg) + kubeflowv1.JobFailed, tfJobFailedReason, msg) if err != nil { commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) return err @@ -572,9 +571,9 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1 return nil } -func (r *TFJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { +func (r *TFJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = map[commonv1.ReplicaType]*commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{} } tfJob, ok := job.(*kubeflowv1.TFJob) @@ -649,8 +648,8 @@ func (r *TFJobReconciler) GetDefaultContainerPortName() string { return kubeflowv1.TFJobDefaultPortName } -func (r *TFJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - rtype commonv1.ReplicaType, index int) bool { +func (r *TFJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, index int) bool { if ContainsChiefOrMasterSpec(replicas) { return rtype == kubeflowv1.TFJobReplicaTypeChief || rtype == kubeflowv1.TFJobReplicaTypeMaster } @@ -659,7 +658,7 @@ func (r *TFJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*common } // IsWorker0Completed returns true if pod of worker0 succeeded and exited with 0 -func (r *TFJobReconciler) IsWorker0Completed(tfJob *kubeflowv1.TFJob, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) (bool, error) { +func (r *TFJobReconciler) IsWorker0Completed(tfJob *kubeflowv1.TFJob, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) (bool, error) { worker0Completed := false _, ok := replicas[kubeflowv1.TFJobReplicaTypeWorker] if !ok { @@ -708,11 +707,11 @@ func (r *TFJobReconciler) getPodSlices(tfjob *kubeflowv1.TFJob, replicasNum *int // It will requeue the tfjob in case of an error while creating/deleting pods. func (r *TFJobReconciler) ReconcilePods( job interface{}, - jobStatus *commonv1.JobStatus, + jobStatus *kubeflowv1.JobStatus, pods []*v1.Pod, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, ) error { tfJob, ok := job.(*kubeflowv1.TFJob) @@ -778,9 +777,9 @@ func (r *TFJobReconciler) ReconcilePods( } // Check if the pod is retryable. 
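The UpdateJobStatusInApiServer hunk above keeps the same lazy initialization of the replica-status map, only with the consolidated types. A hedged sketch of that guard (the helper name and variadic form are illustrative only):

package sketch

import (
	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// ensureReplicaStatuses lazily initializes the per-replica-type status map,
// matching the nil guard at the top of UpdateJobStatusInApiServer above.
func ensureReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtypes ...kubeflowv1.ReplicaType) {
	if jobStatus.ReplicaStatuses == nil {
		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
	}
	for _, rt := range rtypes {
		if jobStatus.ReplicaStatuses[rt] == nil {
			jobStatus.ReplicaStatuses[rt] = &kubeflowv1.ReplicaStatus{}
		}
	}
}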
if pod.Status.Phase == v1.PodFailed && - (spec.RestartPolicy == commonv1.RestartPolicyExitCode && train_util.IsRetryableExitCode(exitCode) || - spec.RestartPolicy == commonv1.RestartPolicyOnFailure || - spec.RestartPolicy == commonv1.RestartPolicyAlways) { + (spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode && trainutil.IsRetryableExitCode(exitCode) || + spec.RestartPolicy == kubeflowv1.RestartPolicyOnFailure || + spec.RestartPolicy == kubeflowv1.RestartPolicyAlways) { logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name) if err := r.PodControl.DeletePod(pod.Namespace, pod.Name, tfJob); err != nil { return err @@ -791,7 +790,7 @@ func (r *TFJobReconciler) ReconcilePods( msg := fmt.Sprintf("TFJob %s is restarting because %s replica(s) failed.", tfJob.Name, rtype) r.Recorder.Event(tfJob, corev1.EventTypeWarning, tfJobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, tfJobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, tfJobRestartingReason, msg) if err != nil { commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err) return err @@ -806,8 +805,8 @@ func (r *TFJobReconciler) ReconcilePods( } // createNewPod creates a new pod for the given index and type. -func (r *TFJobReconciler) createNewPod(tfjob *kubeflowv1.TFJob, rt, index string, spec *commonv1.ReplicaSpec, masterRole bool, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { +func (r *TFJobReconciler) createNewPod(tfjob *kubeflowv1.TFJob, rt, index string, spec *kubeflowv1.ReplicaSpec, masterRole bool, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error { tfjobKey, err := common.KeyFunc(tfjob) if err != nil { @@ -825,11 +824,11 @@ func (r *TFJobReconciler) createNewPod(tfjob *kubeflowv1.TFJob, rt, index string // Set type and index for the worker. 
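The retryable-pod check above is untouched apart from the alias swap and the train_util → trainutil rename. Condensed into a hedged, standalone sketch (hypothetical helper name):

package sketch

import (
	corev1 "k8s.io/api/core/v1"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainutil "github.com/kubeflow/training-operator/pkg/util/train"
)

// shouldRestartPod condenses the retry decision above: a failed pod is retried
// when the replica's restart policy is ExitCode with a retryable exit code,
// or OnFailure, or Always.
func shouldRestartPod(phase corev1.PodPhase, policy kubeflowv1.RestartPolicy, exitCode int32) bool {
	if phase != corev1.PodFailed {
		return false
	}
	return (policy == kubeflowv1.RestartPolicyExitCode && trainutil.IsRetryableExitCode(exitCode)) ||
		policy == kubeflowv1.RestartPolicyOnFailure ||
		policy == kubeflowv1.RestartPolicyAlways
}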
labels := r.GenLabels(tfjob.Name) - labels[commonv1.ReplicaTypeLabel] = rt - labels[commonv1.ReplicaIndexLabel] = index + labels[kubeflowv1.ReplicaTypeLabel] = rt + labels[kubeflowv1.ReplicaIndexLabel] = index if masterRole { - labels[commonv1.JobRoleLabel] = "master" + labels[kubeflowv1.JobRoleLabel] = "master" } podTemplate := spec.Template.DeepCopy() @@ -909,7 +908,7 @@ func (r *TFJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool { msg := fmt.Sprintf("TFJob %s is created.", e.Object.GetName()) logrus.Info(msg) trainingoperatorcommon.CreatedJobsCounterInc(tfJob.Namespace, kubeflowv1.TFJobFrameworkName) - if err := commonutil.UpdateJobConditions(&tfJob.Status, commonv1.JobCreated, "TFJobCreated", msg); err != nil { + if err := commonutil.UpdateJobConditions(&tfJob.Status, kubeflowv1.JobCreated, "TFJobCreated", msg); err != nil { log.Log.Error(err, "append job condition error") return false } diff --git a/pkg/controller.v1/tensorflow/tfjob_controller_test.go b/pkg/controller.v1/tensorflow/tfjob_controller_test.go index 3b98b19e1d..f7db36570e 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller_test.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller_test.go @@ -25,7 +25,6 @@ import ( "k8s.io/apimachinery/pkg/util/uuid" "sigs.k8s.io/controller-runtime/pkg/client" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil" ) @@ -34,8 +33,8 @@ var _ = Describe("TFJob controller", func() { Context("Test Normal Path", func() { It("should create desired Pods and Services", func() { var ( - tfJobRunning = commonv1.JobRunning - tfJobSucceeded = commonv1.JobSucceeded + tfJobRunning = kubeflowv1.JobRunning + tfJobSucceeded = kubeflowv1.JobSucceeded ) testCases := map[string]struct { @@ -72,7 +71,7 @@ var _ = Describe("TFJob controller", func() { expectedSucceededPSPods int32 expectedFailedPSPods int32 - expectedCondition *commonv1.JobConditionType + expectedCondition *kubeflowv1.JobConditionType expectedConditionReason string // There are some cases that should not check start time since the field should be set in the previous sync loop. diff --git a/pkg/controller.v1/tensorflow/util.go b/pkg/controller.v1/tensorflow/util.go index ba87bc33c9..0b4d3e3be9 100644 --- a/pkg/controller.v1/tensorflow/util.go +++ b/pkg/controller.v1/tensorflow/util.go @@ -17,12 +17,11 @@ package tensorflow import ( corev1 "k8s.io/api/core/v1" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) // GetPortFromTFJob gets the port of tensorflow container. -func GetPortFromTFJob(tfJob *kubeflowv1.TFJob, rtype commonv1.ReplicaType) (int32, error) { +func GetPortFromTFJob(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType) (int32, error) { containers := tfJob.Spec.TFReplicaSpecs[rtype].Template.Spec.Containers for _, container := range containers { if container.Name == kubeflowv1.TFJobDefaultContainerName { @@ -38,7 +37,7 @@ func GetPortFromTFJob(tfJob *kubeflowv1.TFJob, rtype commonv1.ReplicaType) (int3 } // ContainsChiefOrMasterSpec returns true if the tfjob contains chief or master spec. 
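The pod labeling in createNewPod above now relies solely on the kubeflowv1 label constants. A hedged sketch of the same scheme (the helper is illustrative and the base map stands in for the controller's GenLabels output):

package sketch

import (
	"strings"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// replicaLabels applies the replica-type, replica-index, and optional
// master-role labels on top of the job's base labels, as createNewPod does.
func replicaLabels(base map[string]string, rtype kubeflowv1.ReplicaType, index string, masterRole bool) map[string]string {
	labels := map[string]string{}
	for k, v := range base {
		labels[k] = v
	}
	labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(rtype))
	labels[kubeflowv1.ReplicaIndexLabel] = index
	if masterRole {
		labels[kubeflowv1.JobRoleLabel] = "master"
	}
	return labels
}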
-func ContainsChiefOrMasterSpec(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool { +func ContainsChiefOrMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool { if _, ok := replicas[kubeflowv1.TFJobReplicaTypeChief]; ok { return true } else if _, ok := replicas[kubeflowv1.TFJobReplicaTypeMaster]; ok { @@ -60,9 +59,9 @@ func getContainerExitCode(pod *corev1.Pod) int32 { } // originally from pkg/controller.v1/tensorflow/pod.go (deleted) -func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *commonv1.ReplicaSpec) { +func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *kubeflowv1.ReplicaSpec) { // This is necessary since restartPolicyExitCode is not supported in v1.PodTemplateSpec - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever } else { podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy) @@ -75,7 +74,7 @@ func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *commonv1.Re func isDistributed(tfjob *kubeflowv1.TFJob) bool { replicas := tfjob.Spec.TFReplicaSpecs distributionCount := 0 - allTypes := []commonv1.ReplicaType{ + allTypes := []kubeflowv1.ReplicaType{ kubeflowv1.TFJobReplicaTypeChief, kubeflowv1.TFJobReplicaTypeEval, kubeflowv1.TFJobReplicaTypeMaster, @@ -97,17 +96,17 @@ func isDistributed(tfjob *kubeflowv1.TFJob) bool { // initializeReplicaStatuses initializes the ReplicaStatuses for replica. // originally from pkg/controller.v1/tensorflow/status.go (deleted) -func initializeReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType) { +func initializeReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType) { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = make(map[commonv1.ReplicaType]*commonv1.ReplicaStatus) + jobStatus.ReplicaStatuses = make(map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus) } - jobStatus.ReplicaStatuses[rtype] = &commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses[rtype] = &kubeflowv1.ReplicaStatus{} } // updateJobReplicaStatuses updates the JobReplicaStatuses according to the pod. 
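setRestartPolicy above keeps its special case for ExitCode, which has no direct corev1 counterpart. A minimal sketch of that mapping, assuming only the types shown in the hunk (hypothetical helper name):

package sketch

import (
	corev1 "k8s.io/api/core/v1"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// toPodRestartPolicy mirrors setRestartPolicy above: ExitCode falls back to
// Never on the pod template, and the controller makes the restart decision
// from the container exit code instead.
func toPodRestartPolicy(policy kubeflowv1.RestartPolicy) corev1.RestartPolicy {
	if policy == kubeflowv1.RestartPolicyExitCode {
		return corev1.RestartPolicyNever
	}
	return corev1.RestartPolicy(policy)
}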
// originally from pkg/controller.v1/tensorflow/status.go (deleted) -func updateJobReplicaStatuses(jobStatus *commonv1.JobStatus, rtype commonv1.ReplicaType, pod *corev1.Pod) { +func updateJobReplicaStatuses(jobStatus *kubeflowv1.JobStatus, rtype kubeflowv1.ReplicaType, pod *corev1.Pod) { switch pod.Status.Phase { case corev1.PodRunning: jobStatus.ReplicaStatuses[rtype].Active++ diff --git a/pkg/controller.v1/tensorflow/util_test.go b/pkg/controller.v1/tensorflow/util_test.go index dd2c8362d3..32f18f5b12 100644 --- a/pkg/controller.v1/tensorflow/util_test.go +++ b/pkg/controller.v1/tensorflow/util_test.go @@ -17,7 +17,6 @@ package tensorflow import ( "testing" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/uuid" @@ -51,14 +50,14 @@ func TestGenLabels(t *testing.T) { expctedVal := "test-key" labels := reconciler.GenLabels(testJobName) - jobNameLabel := commonv1.JobNameLabel + jobNameLabel := kubeflowv1.JobNameLabel if labels[jobNameLabel] != expctedVal { t.Errorf("Expected %s %s, got %s", jobNameLabel, expctedVal, jobNameLabel) } - if labels[commonv1.OperatorNameLabel] != controllerName { - t.Errorf("Expected %s %s, got %s", commonv1.OperatorNameLabel, controllerName, - labels[commonv1.OperatorNameLabel]) + if labels[kubeflowv1.OperatorNameLabel] != controllerName { + t.Errorf("Expected %s %s, got %s", kubeflowv1.OperatorNameLabel, controllerName, + labels[kubeflowv1.OperatorNameLabel]) } } diff --git a/pkg/controller.v1/xgboost/status.go b/pkg/controller.v1/xgboost/status.go index 219036f9bc..534c7533d1 100644 --- a/pkg/controller.v1/xgboost/status.go +++ b/pkg/controller.v1/xgboost/status.go @@ -5,14 +5,14 @@ import ( "github.com/sirupsen/logrus" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" ) -func setRunningCondition(logger *logrus.Entry, jobName string, jobStatus *commonv1.JobStatus) error { +func setRunningCondition(logger *logrus.Entry, jobName string, jobStatus *kubeflowv1.JobStatus) error { msg := fmt.Sprintf("XGBoostJob %s is running.", jobName) - if condition := findStatusCondition(jobStatus.Conditions, commonv1.JobRunning); condition == nil { - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, xgboostJobRunningReason, msg) + if condition := findStatusCondition(jobStatus.Conditions, kubeflowv1.JobRunning); condition == nil { + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, xgboostJobRunningReason, msg) if err != nil { logger.Infof("Append job condition error: %v", err) return err @@ -21,7 +21,7 @@ func setRunningCondition(logger *logrus.Entry, jobName string, jobStatus *common return nil } -func findStatusCondition(conditions []commonv1.JobCondition, conditionType commonv1.JobConditionType) *commonv1.JobCondition { +func findStatusCondition(conditions []kubeflowv1.JobCondition, conditionType kubeflowv1.JobConditionType) *kubeflowv1.JobCondition { for i := range conditions { if conditions[i].Type == conditionType { return &conditions[i] diff --git a/pkg/controller.v1/xgboost/status_test.go b/pkg/controller.v1/xgboost/status_test.go index a3d32261a5..c91e99c1e7 100644 --- a/pkg/controller.v1/xgboost/status_test.go +++ b/pkg/controller.v1/xgboost/status_test.go @@ -8,36 +8,36 @@ import ( "github.com/sirupsen/logrus" corev1 "k8s.io/api/core/v1" - commonv1 
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) -var ignoreJobConditionsTimeOpts = cmpopts.IgnoreFields(commonv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime") +var ignoreJobConditionsTimeOpts = cmpopts.IgnoreFields(kubeflowv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime") func TestSetRunningCondition(t *testing.T) { jobName := "test-xbgoostjob" logger := logrus.NewEntry(logrus.New()) tests := map[string]struct { - input []commonv1.JobCondition - want []commonv1.JobCondition + input []kubeflowv1.JobCondition + want []kubeflowv1.JobCondition }{ "input doesn't have a running condition": { - input: []commonv1.JobCondition{ + input: []kubeflowv1.JobCondition{ { - Type: commonv1.JobSucceeded, + Type: kubeflowv1.JobSucceeded, Reason: "XGBoostJobSucceeded", Message: "XGBoostJob test-xbgoostjob is successfully completed.", Status: corev1.ConditionTrue, }, }, - want: []commonv1.JobCondition{ + want: []kubeflowv1.JobCondition{ { - Type: commonv1.JobSucceeded, + Type: kubeflowv1.JobSucceeded, Reason: "XGBoostJobSucceeded", Message: "XGBoostJob test-xbgoostjob is successfully completed.", Status: corev1.ConditionTrue, }, { - Type: commonv1.JobRunning, + Type: kubeflowv1.JobRunning, Reason: "XGBoostJobRunning", Message: "XGBoostJob test-xbgoostjob is running.", Status: corev1.ConditionTrue, @@ -45,29 +45,29 @@ func TestSetRunningCondition(t *testing.T) { }, }, "input has a running condition": { - input: []commonv1.JobCondition{ + input: []kubeflowv1.JobCondition{ { - Type: commonv1.JobFailed, + Type: kubeflowv1.JobFailed, Reason: "XGBoostJobFailed", Message: "XGBoostJob test-sgboostjob is failed because 2 Worker replica(s) failed.", Status: corev1.ConditionTrue, }, { - Type: commonv1.JobRunning, + Type: kubeflowv1.JobRunning, Reason: "XGBoostJobRunning", Message: "XGBoostJob test-xbgoostjob is running.", Status: corev1.ConditionTrue, }, }, - want: []commonv1.JobCondition{ + want: []kubeflowv1.JobCondition{ { - Type: commonv1.JobFailed, + Type: kubeflowv1.JobFailed, Reason: "XGBoostJobFailed", Message: "XGBoostJob test-sgboostjob is failed because 2 Worker replica(s) failed.", Status: corev1.ConditionTrue, }, { - Type: commonv1.JobRunning, + Type: kubeflowv1.JobRunning, Reason: "XGBoostJobRunning", Message: "XGBoostJob test-xbgoostjob is running.", Status: corev1.ConditionTrue, @@ -77,7 +77,7 @@ func TestSetRunningCondition(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - jobStatus := &commonv1.JobStatus{Conditions: tc.input} + jobStatus := &kubeflowv1.JobStatus{Conditions: tc.input} err := setRunningCondition(logger, jobName, jobStatus) if err != nil { t.Fatalf("failed to update job condition: %v", err) @@ -91,23 +91,23 @@ func TestSetRunningCondition(t *testing.T) { func TestFindStatusCondition(t *testing.T) { tests := map[string]struct { - conditions []commonv1.JobCondition - want *commonv1.JobCondition + conditions []kubeflowv1.JobCondition + want *kubeflowv1.JobCondition }{ "conditions have a running condition": { - conditions: []commonv1.JobCondition{ + conditions: []kubeflowv1.JobCondition{ { - Type: commonv1.JobRunning, + Type: kubeflowv1.JobRunning, }, }, - want: &commonv1.JobCondition{ - Type: commonv1.JobRunning, + want: &kubeflowv1.JobCondition{ + Type: kubeflowv1.JobRunning, }, }, "condition doesn't have a running condition": { - conditions: []commonv1.JobCondition{ + conditions: []kubeflowv1.JobCondition{ { - Type: commonv1.JobSucceeded, + 
Type: kubeflowv1.JobSucceeded, }, }, want: nil, @@ -115,7 +115,7 @@ func TestFindStatusCondition(t *testing.T) { } for name, tc := range tests { t.Run(name, func(t *testing.T) { - got := findStatusCondition(tc.conditions, commonv1.JobRunning) + got := findStatusCondition(tc.conditions, kubeflowv1.JobRunning) if diff := cmp.Diff(tc.want, got, ignoreJobConditionsTimeOpts); len(diff) != 0 { t.Fatalf("Unexpected jobConditions from findStatusCondition (-want,got):\n%s", diff) } diff --git a/pkg/controller.v1/xgboost/xgboost.go b/pkg/controller.v1/xgboost/xgboost.go index 8c2a94e13b..b8f9927aa7 100644 --- a/pkg/controller.v1/xgboost/xgboost.go +++ b/pkg/controller.v1/xgboost/xgboost.go @@ -19,7 +19,6 @@ import ( "strconv" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -42,7 +41,7 @@ func SetPodEnv(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, inde // Add master offset for worker pods if strings.EqualFold(strings.ToLower(rtype), strings.ToLower(string(kubeflowv1.XGBoostJobReplicaTypeWorker))) { - masterSpec := xgboostjob.Spec.XGBReplicaSpecs[commonv1.ReplicaType(kubeflowv1.XGBoostJobReplicaTypeMaster)] + masterSpec := xgboostjob.Spec.XGBReplicaSpecs[kubeflowv1.XGBoostJobReplicaTypeMaster] masterReplicas := int(*masterSpec.Replicas) rank += masterReplicas } @@ -111,14 +110,14 @@ func SetPodEnv(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, inde return nil } -func replicaName(jobName string, rtype commonv1.ReplicaType, index int) string { +func replicaName(jobName string, rtype kubeflowv1.ReplicaType, index int) string { n := jobName + "-" + strings.ToLower(string(rtype)) + "-" + strconv.Itoa(index) return strings.Replace(n, "/", "-", -1) } // getPortFromXGBoostJob gets the port of xgboost container. 
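The worker-rank offset in SetPodEnv above now indexes XGBReplicaSpecs directly with the kubeflowv1 constant instead of converting through commonv1.ReplicaType. A hedged sketch of the same arithmetic (hypothetical helper; nil checks added for the standalone form):

package sketch

import (
	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// workerRank reflects the master-offset logic in SetPodEnv above: worker ranks
// begin after the master replicas so the two replica types never collide.
func workerRank(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, index int) int {
	rank := index
	if master, ok := replicas[kubeflowv1.XGBoostJobReplicaTypeMaster]; ok && master.Replicas != nil {
		rank += int(*master.Replicas)
	}
	return rank
}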
-func getPortFromXGBoostJob(job *kubeflowv1.XGBoostJob, rtype commonv1.ReplicaType) (int32, error) { - containers := job.Spec.XGBReplicaSpecs[commonv1.ReplicaType(rtype)].Template.Spec.Containers +func getPortFromXGBoostJob(job *kubeflowv1.XGBoostJob, rtype kubeflowv1.ReplicaType) (int32, error) { + containers := job.Spec.XGBReplicaSpecs[rtype].Template.Spec.Containers for _, container := range containers { if container.Name == kubeflowv1.XGBoostJobDefaultContainerName { ports := container.Ports diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index 4bc804c409..8003f4729e 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -20,7 +20,6 @@ import ( "reflect" "time" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common" "github.com/kubeflow/training-operator/pkg/common/util" @@ -345,7 +344,7 @@ func (r *XGBoostJobReconciler) DeleteJob(job interface{}) error { } // UpdateJobStatus updates the job status and job conditions -func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, jobStatus *commonv1.JobStatus) error { +func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error { xgboostJob, ok := job.(*kubeflowv1.XGBoostJob) if !ok { return fmt.Errorf("%+v is not a type of xgboostJob", xgboostJob) @@ -381,7 +380,7 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com logrus.Infof("XGBoostJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d", xgboostJob.Name, rtype, expected, running, succeeded, failed) - if rtype == commonv1.ReplicaType(kubeflowv1.XGBoostJobReplicaTypeMaster) { + if rtype == kubeflowv1.XGBoostJobReplicaTypeMaster { if running > 0 { if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil { return err @@ -399,7 +398,7 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobSucceeded, xgboostJobSucceededReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, xgboostJobSucceededReason, msg) if err != nil { logger.Infof("Append job condition error: %v", err) return err @@ -412,10 +411,10 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com if err := setRunningCondition(logger, xgboostJob.Name, jobStatus); err != nil { return err } - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype) r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, xgboostJobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, xgboostJobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, xgboostJobRestartingReason, msg) if err != nil { logger.Infof("Append job condition error: %v", err) return err @@ -428,7 +427,7 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas 
map[com now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobFailed, xgboostJobFailedReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, xgboostJobFailedReason, msg) if err != nil { logger.Infof("Append job condition error: %v", err) return err @@ -441,9 +440,9 @@ func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[com } // UpdateJobStatusInApiServer updates the job status in to cluster. -func (r *XGBoostJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *commonv1.JobStatus) error { +func (r *XGBoostJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error { if jobStatus.ReplicaStatuses == nil { - jobStatus.ReplicaStatuses = map[commonv1.ReplicaType]*commonv1.ReplicaStatus{} + jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{} } xgboostjob, ok := job.(*kubeflowv1.XGBoostJob) @@ -480,8 +479,8 @@ func (r *XGBoostJobReconciler) GetDefaultContainerPortName() string { return kubeflowv1.XGBoostJobDefaultPortName } -func (r *XGBoostJobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - rtype commonv1.ReplicaType, index int) bool { +func (r *XGBoostJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + rtype kubeflowv1.ReplicaType, index int) bool { return string(rtype) == string(kubeflowv1.XGBoostJobReplicaTypeMaster) } @@ -496,7 +495,7 @@ func (r *XGBoostJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool msg := fmt.Sprintf("xgboostJob %s is created.", e.Object.GetName()) logrus.Info(msg) trainingoperatorcommon.CreatedJobsCounterInc(xgboostJob.Namespace, kubeflowv1.XGBoostJobFrameworkName) - if err := commonutil.UpdateJobConditions(&xgboostJob.Status, commonv1.JobCreated, xgboostJobCreatedReason, msg); err != nil { + if err := commonutil.UpdateJobConditions(&xgboostJob.Status, kubeflowv1.JobCreated, xgboostJobCreatedReason, msg); err != nil { log.Log.Error(err, "append job condition error") return false } diff --git a/pkg/reconciler.v1/common/gang_scheduler_framework.go b/pkg/reconciler.v1/common/gang_scheduler_framework.go index ef873d3f55..49beb748e4 100644 --- a/pkg/reconciler.v1/common/gang_scheduler_framework.go +++ b/pkg/reconciler.v1/common/gang_scheduler_framework.go @@ -17,7 +17,7 @@ package common import ( "context" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1/common" "github.com/kubeflow/training-operator/pkg/util/k8sutil" @@ -95,8 +95,8 @@ func (r *SchedulerFrameworkReconciler) DeletePodGroup(ctx context.Context, job c func (r *SchedulerFrameworkReconciler) ReconcilePodGroup( ctx context.Context, job client.Object, - runPolicy *commonv1.RunPolicy, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, + runPolicy *kubeflowv1.RunPolicy, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, ) error { minMember := k8sutil.GetTotalReplicas(replicas) var scheduleTimeoutSeconds *int32 @@ -178,7 +178,7 @@ func (r *SchedulerFrameworkReconciler) DecoratePodForGangScheduling( // calcPGMinResources calculates the minimal resources needed for this job. 
The value will be embedded into the associated PodGroup func (r *SchedulerFrameworkReconciler) calcPGMinResources( minMember int32, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, ) *corev1.ResourceList { return controllerv1.CalcPGMinResources(minMember, replicas, func(pc string) (*schedulingv1.PriorityClass, error) { diff --git a/pkg/reconciler.v1/common/gang_volcano.go b/pkg/reconciler.v1/common/gang_volcano.go index 984b881b28..eddaa6e362 100644 --- a/pkg/reconciler.v1/common/gang_volcano.go +++ b/pkg/reconciler.v1/common/gang_volcano.go @@ -17,7 +17,7 @@ package common import ( "context" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" controllerv1 "github.com/kubeflow/training-operator/pkg/controller.v1/common" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/k8sutil" @@ -107,8 +107,8 @@ func (r *VolcanoReconciler) DeletePodGroup(ctx context.Context, job client.Objec func (r *VolcanoReconciler) ReconcilePodGroup( ctx context.Context, job client.Object, - runPolicy *commonv1.RunPolicy, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { + runPolicy *kubeflowv1.RunPolicy, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error { minMember := k8sutil.GetTotalReplicas(replicas) queue := "" @@ -192,7 +192,7 @@ func (r *VolcanoReconciler) DecoratePodForGangScheduling(rtype string, podTempla } // calcPGMinResources calculates the minimal resources needed for this job. The value will be embedded into the associated PodGroup -func (r *VolcanoReconciler) calcPGMinResources(minMember int32, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) *corev1.ResourceList { +func (r *VolcanoReconciler) calcPGMinResources(minMember int32, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) *corev1.ResourceList { pcGetFunc := func(pc string) (*schedulingv1.PriorityClass, error) { priorityClass := &schedulingv1.PriorityClass{} err := r.Get(context.Background(), types.NamespacedName{Name: pc}, priorityClass) diff --git a/pkg/reconciler.v1/common/interface.go b/pkg/reconciler.v1/common/interface.go index 8ef3fa7557..59e13cd0ef 100644 --- a/pkg/reconciler.v1/common/interface.go +++ b/pkg/reconciler.v1/common/interface.go @@ -17,7 +17,7 @@ package common import ( "context" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/go-logr/logr" "github.com/sirupsen/logrus" @@ -73,8 +73,8 @@ type GangSchedulingInterface interface { DeletePodGroup(ctx context.Context, job client.Object) error // ReconcilePodGroup CAN be overridden if the logic to reconcile PodGroup changes. - ReconcilePodGroup(ctx context.Context, job client.Object, runPolicy *commonv1.RunPolicy, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error + ReconcilePodGroup(ctx context.Context, job client.Object, runPolicy *kubeflowv1.RunPolicy, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error // DecoratePodForGangScheduling SHOULD be overridden if gang scheduler demands Pods associated with PodGroup to be // decorated with specific requests. 
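For reference, a minimal sketch of how code built on these interfaces consumes the updated signatures after the alias change; it assumes only the kubeflowv1 types that appear in the hunks above, and the package name example and the helper totalReplicas are illustrative, not part of this patch:

package example

import (
	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// totalReplicas sums the requested replicas across every replica type in the
// map type that ReconcilePodGroup now takes. Treating a nil count as a single
// replica is an assumption of this sketch, not behaviour guaranteed by the
// operator.
func totalReplicas(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) int32 {
	var total int32
	for _, spec := range replicas {
		if spec == nil || spec.Replicas == nil {
			total++
			continue
		}
		total += *spec.Replicas
	}
	return total
}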
@@ -106,15 +106,15 @@ type PodInterface interface { ReconcilePods( ctx context.Context, job client.Object, - jobStatus *commonv1.JobStatus, + jobStatus *kubeflowv1.JobStatus, pods []*corev1.Pod, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error + rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error // CreateNewPod CAN be overridden to customize how to create a new pod. CreateNewPod(job client.Object, rt string, index string, - spec *commonv1.ReplicaSpec, masterRole bool, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error + spec *kubeflowv1.ReplicaSpec, masterRole bool, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error // DeletePod CAN be overridden to customize how to delete a pod of {name} in namespace {ns}. DeletePod(ctx context.Context, ns string, name string) error @@ -130,7 +130,7 @@ type ServiceInterface interface { OverrideForServiceInterface(ui ReconcilerUtilInterface, pi PodInterface, ji JobInterface) // GetPortsFromJob CAN be overridden to customize how to find ports defined in the ReplicasSpec. - GetPortsFromJob(spec *commonv1.ReplicaSpec) (map[string]int32, error) + GetPortsFromJob(spec *kubeflowv1.ReplicaSpec) (map[string]int32, error) // GetServicesForJob CAN be overridden to customize how to find all services associated with this job. GetServicesForJob(ctx context.Context, job client.Object) ([]*corev1.Service, error) @@ -145,12 +145,12 @@ type ServiceInterface interface { ReconcileServices( job client.Object, services []*corev1.Service, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec) error + rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec) error // CreateNewService CAN be overridden to customize how to create a new service. - CreateNewService(job client.Object, rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, index string) error + CreateNewService(job client.Object, rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec, index string) error // DeleteService CAN be overridden to customize how to delete the service of {name} in namespace {ns}. DeleteService(ns string, name string, job client.Object) error @@ -175,25 +175,25 @@ type JobInterface interface { GetJob(ctx context.Context, req ctrl.Request) (client.Object, error) // ExtractReplicasSpec MUST be overridden to extract ReplicasSpec from a job - ExtractReplicasSpec(job client.Object) (map[commonv1.ReplicaType]*commonv1.ReplicaSpec, error) + ExtractReplicasSpec(job client.Object) (map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, error) // ExtractRunPolicy MUST be overridden to extract the pointer of RunPolicy from a job - ExtractRunPolicy(job client.Object) (*commonv1.RunPolicy, error) + ExtractRunPolicy(job client.Object) (*kubeflowv1.RunPolicy, error) // ExtractJobStatus MUST be overridden to extract the pointer of JobStatus from a job - ExtractJobStatus(job client.Object) (*commonv1.JobStatus, error) + ExtractJobStatus(job client.Object) (*kubeflowv1.JobStatus, error) // IsMasterRole MUST be overridden to determine whether this ReplicaType with index specified is a master role. 
// MasterRole pod will have "job-role=master" set in its label - IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool + IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, rtype kubeflowv1.ReplicaType, index int) bool // ReconcileJob CAN be overridden to customize how to reconcile a job. ReconcileJob( ctx context.Context, job client.Object, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - status *commonv1.JobStatus, - runPolicy *commonv1.RunPolicy) error + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + status *kubeflowv1.JobStatus, + runPolicy *kubeflowv1.RunPolicy) error // DeleteJob CAN be overridden to customize how to delete a job. DeleteJob(job client.Object) error @@ -201,41 +201,41 @@ type JobInterface interface { // UpdateJobStatus CAN be overridden to customize how to update job status without submitting to APIServer. UpdateJobStatus( job client.Object, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - jobStatus *commonv1.JobStatus) error + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + jobStatus *kubeflowv1.JobStatus) error // UpdateJobStatusInAPIServer CAN be overridden to customize how to update job status directly to APIServer. UpdateJobStatusInAPIServer(ctx context.Context, job client.Object) error // CleanupResources CAN be overridden to customize how to delete all resources associated with this job. - CleanupResources(runPolicy *commonv1.RunPolicy, status commonv1.JobStatus, job client.Object) error + CleanupResources(runPolicy *kubeflowv1.RunPolicy, status kubeflowv1.JobStatus, job client.Object) error // CleanupJob CAN be overridden to customize how to clean up this job. - CleanupJob(runPolicy *commonv1.RunPolicy, status commonv1.JobStatus, job client.Object) error + CleanupJob(runPolicy *kubeflowv1.RunPolicy, status kubeflowv1.JobStatus, job client.Object) error // RecordAbnormalPods CAN be overridden to customize how to record abnormal pods RecordAbnormalPods(activePods []*corev1.Pod, object client.Object) // SetStatusForSuccessJob CAN be overridden to customize how to set status for success job - SetStatusForSuccessJob(status *commonv1.JobStatus) + SetStatusForSuccessJob(status *kubeflowv1.JobStatus) // IsFlagReplicaTypeForJobStatus CAN be overridden to customize how to determine if this ReplicaType is the // flag ReplicaType for the status of this kind of job IsFlagReplicaTypeForJobStatus(rtype string) bool // IsJobSucceeded CAN be overridden to customize how to determine if this job is succeeded. - IsJobSucceeded(status commonv1.JobStatus) bool + IsJobSucceeded(status kubeflowv1.JobStatus) bool // IsJobFailed CAN be overridden to customize how to determine if this job is failed. - IsJobFailed(status commonv1.JobStatus) bool + IsJobFailed(status kubeflowv1.JobStatus) bool // ShouldCleanUp CAN be overridden to customize how to determine if this job should be cleaned up. - ShouldCleanUp(status commonv1.JobStatus) bool + ShouldCleanUp(status kubeflowv1.JobStatus) bool // PastBackoffLimit CAN be overridden to customize how to determine if this job has past backoff limit. 
- PastBackoffLimit(jobName string, runPolicy *commonv1.RunPolicy, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, pods []*corev1.Pod) (bool, error) + PastBackoffLimit(jobName string, runPolicy *kubeflowv1.RunPolicy, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, pods []*corev1.Pod) (bool, error) // PastActiveDeadline CAN be overridden to customize how to determine if this job has past activate deadline. - PastActiveDeadline(runPolicy *commonv1.RunPolicy, jobStatus *commonv1.JobStatus) bool + PastActiveDeadline(runPolicy *kubeflowv1.RunPolicy, jobStatus *kubeflowv1.JobStatus) bool } diff --git a/pkg/reconciler.v1/common/job.go b/pkg/reconciler.v1/common/job.go index c301ece309..d8c52f57a9 100644 --- a/pkg/reconciler.v1/common/job.go +++ b/pkg/reconciler.v1/common/job.go @@ -21,9 +21,7 @@ import ( "strings" "time" - ctrl "sigs.k8s.io/controller-runtime" - - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/core" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/k8sutil" @@ -33,6 +31,7 @@ import ( "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -102,8 +101,8 @@ func (r *JobReconciler) OverrideForJobInterface(ui ReconcilerUtilInterface, pi P func (r *JobReconciler) GenLabels(jobName string) map[string]string { jobName = strings.Replace(jobName, "/", "-", -1) return map[string]string{ - commonv1.OperatorNameLabel: r.GetReconcilerName(), - commonv1.JobNameLabel: jobName, + kubeflowv1.OperatorNameLabel: r.GetReconcilerName(), + kubeflowv1.JobNameLabel: jobName, } } @@ -116,9 +115,9 @@ func (r *JobReconciler) GetGroupNameLabelValue() string { func (r *JobReconciler) ReconcileJob( ctx context.Context, job client.Object, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - status *commonv1.JobStatus, - runPolicy *commonv1.RunPolicy) error { + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + status *kubeflowv1.JobStatus, + runPolicy *kubeflowv1.RunPolicy) error { logger := r.GetLogger(job) logger.Info(MsgReconcileStart) @@ -214,7 +213,7 @@ func (r *JobReconciler) ReconcileJob( r.GetRecorder().Event(job, corev1.EventTypeNormal, commonutil.JobFailedReason, failureMessage) - if err = commonutil.UpdateJobConditions(status, commonv1.JobFailed, commonutil.JobFailedReason, failureMessage); err != nil { + if err = commonutil.UpdateJobConditions(status, kubeflowv1.JobFailed, commonutil.JobFailedReason, failureMessage); err != nil { logrus.Infof(ErrAppendJobConditionTemplate, err) return err } @@ -270,7 +269,7 @@ func (r *JobReconciler) RecordAbnormalPods(activePods []*corev1.Pod, object clie } // SetStatusForSuccessJob sets the status for job that succeed -func (r *JobReconciler) SetStatusForSuccessJob(status *commonv1.JobStatus) { +func (r *JobReconciler) SetStatusForSuccessJob(status *kubeflowv1.JobStatus) { for rytpe := range status.ReplicaStatuses { status.ReplicaStatuses[rytpe].Succeeded += status.ReplicaStatuses[rytpe].Active status.ReplicaStatuses[rytpe].Active = 0 @@ -280,8 +279,8 @@ func (r *JobReconciler) SetStatusForSuccessJob(status *commonv1.JobStatus) { // UpdateJobStatus updates the status of this generic training job WITHOUT pushing the updated status to the APIServer func (r *JobReconciler) 
UpdateJobStatus( job client.Object, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, - jobStatus *commonv1.JobStatus) error { + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, + jobStatus *kubeflowv1.JobStatus) error { logrus.Warnf(WarnDefaultImplementationTemplate, "UpdateJobStatus") @@ -307,7 +306,7 @@ func (r *JobReconciler) UpdateJobStatus( if r.IsFlagReplicaTypeForJobStatus(string(rtype)) { if running > 0 { msg := fmt.Sprintf("%s %s is running.", jobKind, jobNamespacedName) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg) if err != nil { logger.Info(ErrAppendJobConditionTemplate, err) return err @@ -322,7 +321,7 @@ func (r *JobReconciler) UpdateJobStatus( now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobSucceeded, commonutil.JobSucceededReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, commonutil.JobSucceededReason, msg) if err != nil { logger.Info(ErrAppendJobConditionTemplate, err) } @@ -331,11 +330,11 @@ func (r *JobReconciler) UpdateJobStatus( } if failed > 0 { - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { msg := fmt.Sprintf("%s %s is restarting because %d %s replica(s) failed.", jobKind, jobNamespacedName, failed, rtype) r.GetRecorder().Event(job, corev1.EventTypeWarning, commonutil.JobRestartingReason, msg) - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, commonutil.JobRestartingReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, commonutil.JobRestartingReason, msg) if err != nil { logger.Info(ErrAppendJobConditionTemplate, err) return err @@ -347,7 +346,7 @@ func (r *JobReconciler) UpdateJobStatus( now := metav1.Now() jobStatus.CompletionTime = &now } - err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobFailed, commonutil.JobFailedReason, msg) + err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, commonutil.JobFailedReason, msg) if err != nil { logger.Info(ErrAppendJobConditionTemplate, err) return err @@ -360,7 +359,7 @@ func (r *JobReconciler) UpdateJobStatus( msg := fmt.Sprintf("%s %s is running.", jobKind, jobNamespacedName) logger.Info(msg) - if err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRunning, commonutil.JobRunningReason, msg); err != nil { + if err := commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, commonutil.JobRunningReason, msg); err != nil { logger.Error(err, ErrUpdateJobConditionsFailed, jobKind) return err } @@ -374,12 +373,12 @@ func (r *JobReconciler) UpdateJobStatusInAPIServer(ctx context.Context, job clie } // CleanupResources cleans up all resources associated with this generic training job -func (r *JobReconciler) CleanupResources(runPolicy *commonv1.RunPolicy, status commonv1.JobStatus, job client.Object) error { - if *runPolicy.CleanPodPolicy == commonv1.CleanPodPolicyNone { +func (r *JobReconciler) CleanupResources(runPolicy *kubeflowv1.RunPolicy, status kubeflowv1.JobStatus, job client.Object) error { + if *runPolicy.CleanPodPolicy == kubeflowv1.CleanPodPolicyNone { return nil } ctx := context.Background() - cleanRunningPod := *runPolicy.CleanPodPolicy == commonv1.CleanPodPolicyRunning + cleanRunningPod := *runPolicy.CleanPodPolicy == 
kubeflowv1.CleanPodPolicyRunning if err := r.DeletePodGroup(ctx, job); err != nil { return err @@ -416,7 +415,7 @@ func (r *JobReconciler) CleanupResources(runPolicy *commonv1.RunPolicy, status c } // CleanupJob cleans up all resources associated with this generic training job as well as the job itself -func (r *JobReconciler) CleanupJob(runPolicy *commonv1.RunPolicy, status commonv1.JobStatus, job client.Object) error { +func (r *JobReconciler) CleanupJob(runPolicy *kubeflowv1.RunPolicy, status kubeflowv1.JobStatus, job client.Object) error { currentTime := time.Now() ttl := runPolicy.TTLSecondsAfterFinished @@ -451,28 +450,28 @@ func (r *JobReconciler) IsFlagReplicaTypeForJobStatus(rtype string) bool { } // IsJobSucceeded checks if this generic training job succeeded -func (r *JobReconciler) IsJobSucceeded(status commonv1.JobStatus) bool { +func (r *JobReconciler) IsJobSucceeded(status kubeflowv1.JobStatus) bool { return commonutil.IsSucceeded(status) } // IsJobFailed checks if this generic training job failed -func (r *JobReconciler) IsJobFailed(status commonv1.JobStatus) bool { +func (r *JobReconciler) IsJobFailed(status kubeflowv1.JobStatus) bool { return commonutil.IsFailed(status) } // ShouldCleanUp checks if resources associated with this generic training job should be cleaned up -func (r *JobReconciler) ShouldCleanUp(status commonv1.JobStatus) bool { +func (r *JobReconciler) ShouldCleanUp(status kubeflowv1.JobStatus) bool { return r.IsJobSucceeded(status) || r.IsJobFailed(status) } // PastBackoffLimit checks if this generic training job has past backoff limit -func (r *JobReconciler) PastBackoffLimit(jobName string, runPolicy *commonv1.RunPolicy, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, pods []*corev1.Pod) (bool, error) { +func (r *JobReconciler) PastBackoffLimit(jobName string, runPolicy *kubeflowv1.RunPolicy, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, pods []*corev1.Pod) (bool, error) { return core.PastBackoffLimit(jobName, runPolicy, replicas, pods, r.FilterPodsForReplicaType) } // PastActiveDeadline checks if this generic training job has ActiveDeadlineSeconds field set and if it is exceeded. 
-func (r *JobReconciler) PastActiveDeadline(runPolicy *commonv1.RunPolicy, jobStatus *commonv1.JobStatus) bool { +func (r *JobReconciler) PastActiveDeadline(runPolicy *kubeflowv1.RunPolicy, jobStatus *kubeflowv1.JobStatus) bool { return core.PastActiveDeadline(runPolicy, *jobStatus) } @@ -480,18 +479,18 @@ func (r *JobReconciler) GetJob(ctx context.Context, req ctrl.Request) (client.Ob panic("implement me") } -func (r *JobReconciler) ExtractReplicasSpec(job client.Object) (map[commonv1.ReplicaType]*commonv1.ReplicaSpec, error) { +func (r *JobReconciler) ExtractReplicasSpec(job client.Object) (map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, error) { panic("implement me") } -func (r *JobReconciler) ExtractRunPolicy(job client.Object) (*commonv1.RunPolicy, error) { +func (r *JobReconciler) ExtractRunPolicy(job client.Object) (*kubeflowv1.RunPolicy, error) { panic("implement me") } -func (r *JobReconciler) ExtractJobStatus(job client.Object) (*commonv1.JobStatus, error) { +func (r *JobReconciler) ExtractJobStatus(job client.Object) (*kubeflowv1.JobStatus, error) { panic("implement me") } -func (r *JobReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool { +func (r *JobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, rtype kubeflowv1.ReplicaType, index int) bool { panic("implement me") } diff --git a/pkg/reconciler.v1/common/pod.go b/pkg/reconciler.v1/common/pod.go index 6078ec40f0..1a4c3d63f2 100644 --- a/pkg/reconciler.v1/common/pod.go +++ b/pkg/reconciler.v1/common/pod.go @@ -27,7 +27,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/core" commonutil "github.com/kubeflow/training-operator/pkg/util" trainutil "github.com/kubeflow/training-operator/pkg/util/train" @@ -119,11 +119,11 @@ func (r *PodReconciler) FilterPodsForReplicaType(pods []*corev1.Pod, replicaType func (r *PodReconciler) ReconcilePods( ctx context.Context, job client.Object, - jobStatus *commonv1.JobStatus, + jobStatus *kubeflowv1.JobStatus, pods []*corev1.Pod, - rType commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, - replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { + rType kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec, + replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error { rt := strings.ToLower(string(rType)) // Convert ReplicaType to lower string. @@ -152,7 +152,7 @@ func (r *PodReconciler) ReconcilePods( logger.Infof("Need to create new pod: %s-%d", rt, index) // check if this replica is the master role - masterRole = r.IsMasterRole(replicas, commonv1.ReplicaType(rt), index) + masterRole = r.IsMasterRole(replicas, kubeflowv1.ReplicaType(rt), index) err = r.CreateNewPod(job, rt, strconv.Itoa(index), spec, masterRole, replicas) if err != nil { return err @@ -180,7 +180,7 @@ func (r *PodReconciler) ReconcilePods( } } // Check if the pod is retryable. 
- if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { if pod.Status.Phase == corev1.PodFailed && trainutil.IsRetryableExitCode(exitCode) { failedPodsCount.Inc() logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name) @@ -199,15 +199,15 @@ func (r *PodReconciler) ReconcilePods( // CreateNewPod generate Pods for this job and submits creation request to APIServer func (r *PodReconciler) CreateNewPod(job client.Object, rt string, index string, - spec *commonv1.ReplicaSpec, masterRole bool, replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) error { + spec *kubeflowv1.ReplicaSpec, masterRole bool, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) error { logger := commonutil.LoggerForReplica(job, rt) podLabels := r.GenLabels(job.GetName()) - podLabels[commonv1.ReplicaTypeLabel] = rt - podLabels[commonv1.ReplicaIndexLabel] = index + podLabels[kubeflowv1.ReplicaTypeLabel] = rt + podLabels[kubeflowv1.ReplicaIndexLabel] = index if masterRole { - podLabels[commonv1.JobRoleLabel] = "master" + podLabels[kubeflowv1.JobRoleLabel] = "master" } podTemplate := spec.Template.DeepCopy() @@ -227,7 +227,7 @@ func (r *PodReconciler) CreateNewPod(job client.Object, rt string, index string, logger.Warning(errMsg) r.GetRecorder().Event(job, corev1.EventTypeWarning, "SettedPodTemplateRestartPolicy", errMsg) } - if spec.RestartPolicy == commonv1.RestartPolicyExitCode { + if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode { podTemplate.Spec.RestartPolicy = corev1.RestartPolicyNever } else { podTemplate.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy) diff --git a/pkg/reconciler.v1/common/pod_test.go b/pkg/reconciler.v1/common/pod_test.go index 03714dbd9c..e4a9d02e35 100644 --- a/pkg/reconciler.v1/common/pod_test.go +++ b/pkg/reconciler.v1/common/pod_test.go @@ -17,7 +17,7 @@ package common_test import ( "testing" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" testjobv1 "github.com/kubeflow/training-operator/test_job/apis/test_job/v1" "github.com/kubeflow/training-operator/test_job/reconciler.v1/test_job" testutilv1 "github.com/kubeflow/training-operator/test_job/test_util/v1" @@ -81,7 +81,7 @@ func TestFilterPodsForReplicaType(t *testing.T) { Name: "pod0", Namespace: "default", Labels: map[string]string{ - commonv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeMaster), + kubeflowv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeMaster), }, }, Spec: corev1.PodSpec{}, @@ -93,7 +93,7 @@ func TestFilterPodsForReplicaType(t *testing.T) { Name: "pod1", Namespace: "default", Labels: map[string]string{ - commonv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeWorker), + kubeflowv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeWorker), }, }, Spec: corev1.PodSpec{}, @@ -105,7 +105,7 @@ func TestFilterPodsForReplicaType(t *testing.T) { Name: "pod2", Namespace: "default", Labels: map[string]string{ - commonv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeWorker), + kubeflowv1.ReplicaTypeLabel: string(testjobv1.TestReplicaTypeWorker), }, }, Spec: corev1.PodSpec{}, diff --git a/pkg/reconciler.v1/common/service.go b/pkg/reconciler.v1/common/service.go index f3400a9883..7aad4351bb 100644 --- a/pkg/reconciler.v1/common/service.go +++ b/pkg/reconciler.v1/common/service.go @@ -19,7 +19,7 @@ import ( "strconv" "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + 
kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/pkg/core" commonutil "github.com/kubeflow/training-operator/pkg/util" @@ -72,7 +72,7 @@ func (r *ServiceReconciler) OverrideForServiceInterface(ui ReconcilerUtilInterfa } // GetPortsFromJob gets the ports of job container. Port could be nil, if distributed communication strategy doesn't need and no other ports that need to be exposed. -func (r *ServiceReconciler) GetPortsFromJob(spec *commonv1.ReplicaSpec) (map[string]int32, error) { +func (r *ServiceReconciler) GetPortsFromJob(spec *kubeflowv1.ReplicaSpec) (map[string]int32, error) { defaultContainerName := r.GetDefaultContainerName() return core.GetPortsFromJob(spec, defaultContainerName) } @@ -108,8 +108,8 @@ func (r *ServiceReconciler) GetServiceSlices(services []*corev1.Service, replica func (r *ServiceReconciler) ReconcileServices( job client.Object, services []*corev1.Service, - rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec) error { + rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec) error { // Convert ReplicaType to lower string. rt := strings.ToLower(string(rtype)) @@ -155,15 +155,15 @@ func (r *ServiceReconciler) ReconcileServices( } // CreateNewService generates Service based the job, replica info. and index and submits it to APIServer -func (r *ServiceReconciler) CreateNewService(job client.Object, rtype commonv1.ReplicaType, - spec *commonv1.ReplicaSpec, index string) error { +func (r *ServiceReconciler) CreateNewService(job client.Object, rtype kubeflowv1.ReplicaType, + spec *kubeflowv1.ReplicaSpec, index string) error { // Convert ReplicaType to lower string. rt := strings.ToLower(string(rtype)) // Append ReplicaTypeLabel and ReplicaIndexLabel labels. 
labels := r.GenLabels(job.GetName()) - labels[commonv1.ReplicaTypeLabel] = rt - labels[commonv1.ReplicaIndexLabel] = index + labels[kubeflowv1.ReplicaTypeLabel] = rt + labels[kubeflowv1.ReplicaIndexLabel] = index ports, err := r.GetPortsFromJob(spec) if err != nil { diff --git a/pkg/reconciler.v1/common/service_test.go b/pkg/reconciler.v1/common/service_test.go index 6075fb8474..1c34d196a0 100644 --- a/pkg/reconciler.v1/common/service_test.go +++ b/pkg/reconciler.v1/common/service_test.go @@ -20,10 +20,10 @@ import ( "strings" "testing" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" testjobv1 "github.com/kubeflow/training-operator/test_job/apis/test_job/v1" "github.com/kubeflow/training-operator/test_job/reconciler.v1/test_job" - test_utilv1 "github.com/kubeflow/training-operator/test_job/test_util/v1" + testutilv1 "github.com/kubeflow/training-operator/test_job/test_util/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -34,14 +34,14 @@ import ( func TestCreateNewService(t *testing.T) { type tc struct { testJob *testjobv1.TestJob - testRType commonv1.ReplicaType - testSpec *commonv1.ReplicaSpec + testRType kubeflowv1.ReplicaType + testSpec *kubeflowv1.ReplicaSpec testIndex string expectedService *corev1.Service } testCase := []tc{ func() tc { - tj := test_utilv1.NewTestJob(3) + tj := testutilv1.NewTestJob(3) jobName := "testjob1" tj.SetName(jobName) idx := "0" @@ -59,16 +59,16 @@ func TestCreateNewService(t *testing.T) { }, ClusterIP: corev1.ClusterIPNone, Selector: map[string]string{ - commonv1.OperatorNameLabel: "Test Reconciler", - commonv1.JobNameLabel: jobName, - commonv1.ReplicaTypeLabel: strings.ToLower(string(testjobv1.TestReplicaTypeWorker)), - commonv1.ReplicaIndexLabel: idx, + kubeflowv1.OperatorNameLabel: "Test Reconciler", + kubeflowv1.JobNameLabel: jobName, + kubeflowv1.ReplicaTypeLabel: strings.ToLower(string(testjobv1.TestReplicaTypeWorker)), + kubeflowv1.ReplicaIndexLabel: idx, }, }, } return tc{ testJob: tj, - testRType: commonv1.ReplicaType(testjobv1.TestReplicaTypeWorker), + testRType: kubeflowv1.ReplicaType(testjobv1.TestReplicaTypeWorker), testSpec: tj.Spec.TestReplicaSpecs[testjobv1.TestReplicaTypeWorker], testIndex: idx, expectedService: svc, diff --git a/pkg/reconciler.v1/common/utils_test.go b/pkg/reconciler.v1/common/utils_test.go index 4b489aa3ea..b8744904e4 100644 --- a/pkg/reconciler.v1/common/utils_test.go +++ b/pkg/reconciler.v1/common/utils_test.go @@ -17,7 +17,7 @@ package common_test import ( "testing" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/kubeflow/training-operator/test_job/reconciler.v1/test_job" ) @@ -32,8 +32,8 @@ func TestGenLabels(t *testing.T) { return tc{ testJobName: "test/job1", expectedLabel: map[string]string{ - commonv1.JobNameLabel: "test-job1", - commonv1.OperatorNameLabel: "Test Reconciler", + kubeflowv1.JobNameLabel: "test-job1", + kubeflowv1.OperatorNameLabel: "Test Reconciler", }, } }(), diff --git a/test_job/apis/test_job/v1/constants.go b/test_job/apis/test_job/v1/constants.go index 595fe770b3..a793c69367 100644 --- a/test_job/apis/test_job/v1/constants.go +++ b/test_job/apis/test_job/v1/constants.go @@ -15,7 +15,7 @@ package v1 import ( - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 
"github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) const ( @@ -29,5 +29,5 @@ const ( // DefaultPort is default value of the port. DefaultPort = 2222 // DefaultRestartPolicy is default RestartPolicy for TFReplicaSpec. - DefaultRestartPolicy = commonv1.RestartPolicyNever + DefaultRestartPolicy = kubeflowv1.RestartPolicyNever ) diff --git a/test_job/apis/test_job/v1/defaults.go b/test_job/apis/test_job/v1/defaults.go index f17c1d698b..6fa1e73019 100644 --- a/test_job/apis/test_job/v1/defaults.go +++ b/test_job/apis/test_job/v1/defaults.go @@ -17,9 +17,10 @@ package v1 import ( "strings" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" + + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) // Int32 is a helper routine that allocates a new int32 value @@ -57,7 +58,7 @@ func setDefaultPort(spec *v1.PodSpec) { } } -func setDefaultReplicas(spec *commonv1.ReplicaSpec) { +func setDefaultReplicas(spec *kubeflowv1.ReplicaSpec) { if spec.Replicas == nil { spec.Replicas = Int32(1) } @@ -89,7 +90,7 @@ func setTypeNameToCamelCase(testJob *TestJob, typ TestReplicaType) { func SetDefaults_TestJob(testjob *TestJob) { // Set default RunPolicy if testjob.Spec.RunPolicy == nil { - testjob.Spec.RunPolicy = &commonv1.RunPolicy{ + testjob.Spec.RunPolicy = &kubeflowv1.RunPolicy{ CleanPodPolicy: nil, TTLSecondsAfterFinished: nil, ActiveDeadlineSeconds: nil, @@ -100,7 +101,7 @@ func SetDefaults_TestJob(testjob *TestJob) { // Set default cleanpod policy to Running. if testjob.Spec.RunPolicy.CleanPodPolicy == nil { - running := commonv1.CleanPodPolicyRunning + running := kubeflowv1.CleanPodPolicyRunning testjob.Spec.RunPolicy.CleanPodPolicy = &running } diff --git a/test_job/apis/test_job/v1/types.go b/test_job/apis/test_job/v1/types.go index 5b59608714..d60ca4dc1c 100644 --- a/test_job/apis/test_job/v1/types.go +++ b/test_job/apis/test_job/v1/types.go @@ -15,7 +15,7 @@ package v1 import ( - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -37,17 +37,17 @@ type TestJob struct { // This data may not be up to date. // Populated by the system. // Read-only. - Status commonv1.JobStatus `json:"status,omitempty"` + Status kubeflowv1.JobStatus `json:"status,omitempty"` } // TestJobSpec is a desired state description of the TestJob. type TestJobSpec struct { - RunPolicy *commonv1.RunPolicy `json:"runPolicy,omitempty"` - TestReplicaSpecs map[TestReplicaType]*commonv1.ReplicaSpec `json:"testReplicaSpecs"` + RunPolicy *kubeflowv1.RunPolicy `json:"runPolicy,omitempty"` + TestReplicaSpecs map[TestReplicaType]*kubeflowv1.ReplicaSpec `json:"testReplicaSpecs"` } // TestReplicaType is the type for TestReplica. 
-type TestReplicaType commonv1.ReplicaType +type TestReplicaType kubeflowv1.ReplicaType const ( TestReplicaTypeWorker TestReplicaType = "Worker" diff --git a/test_job/reconciler.v1/test_job/test_job_reconciler.go b/test_job/reconciler.v1/test_job/test_job_reconciler.go index 8ae5aad792..b6a139d060 100644 --- a/test_job/reconciler.v1/test_job/test_job_reconciler.go +++ b/test_job/reconciler.v1/test_job/test_job_reconciler.go @@ -3,8 +3,8 @@ package test_job import ( "context" - commonv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - common_reconciler "github.com/kubeflow/training-operator/pkg/reconciler.v1/common" + kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + commonreconciler "github.com/kubeflow/training-operator/pkg/reconciler.v1/common" v1 "github.com/kubeflow/training-operator/test_job/apis/test_job/v1" "github.com/kubeflow/training-operator/test_job/client/clientset/versioned/scheme" @@ -19,11 +19,11 @@ import ( ) type TestReconciler struct { - common_reconciler.ReconcilerUtil - common_reconciler.ServiceReconciler - common_reconciler.PodReconciler - common_reconciler.VolcanoReconciler - common_reconciler.JobReconciler + commonreconciler.ReconcilerUtil + commonreconciler.ServiceReconciler + commonreconciler.PodReconciler + commonreconciler.VolcanoReconciler + commonreconciler.JobReconciler FC client.Client Job *v1.TestJob @@ -44,21 +44,21 @@ func NewTestReconciler() *TestReconciler { } // Generate Bare Components - jobR := common_reconciler.BareJobReconciler(fakeClient) + jobR := commonreconciler.BareJobReconciler(fakeClient) jobR.OverrideForJobInterface(r, r, r, r) - podR := common_reconciler.BarePodReconciler(fakeClient) + podR := commonreconciler.BarePodReconciler(fakeClient) podR.OverrideForPodInterface(r, r, r) - svcR := common_reconciler.BareServiceReconciler(fakeClient) + svcR := commonreconciler.BareServiceReconciler(fakeClient) svcR.OverrideForServiceInterface(r, r, r) - gangR := common_reconciler.BareVolcanoReconciler(fakeClient, nil, false) + gangR := commonreconciler.BareVolcanoReconciler(fakeClient, nil, false) gangR.OverrideForGangSchedulingInterface(r) Log := log.Log - utilR := common_reconciler.BareUtilReconciler(nil, Log, scm) - //kubeflowReconciler := common_reconciler.BareKubeflowReconciler() + utilR := commonreconciler.BareUtilReconciler(nil, Log, scm) + //kubeflowReconciler := commonreconciler.BareKubeflowReconciler() r.JobReconciler = *jobR r.PodReconciler = *podR @@ -136,29 +136,29 @@ func (r *TestReconciler) GetServicesForJob(ctx context.Context, job client.Objec return r.Services, nil } -func (r *TestReconciler) ExtractReplicasSpec(job client.Object) (map[commonv1.ReplicaType]*commonv1.ReplicaSpec, error) { +func (r *TestReconciler) ExtractReplicasSpec(job client.Object) (map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, error) { tj := job.(*v1.TestJob) - rs := map[commonv1.ReplicaType]*commonv1.ReplicaSpec{} + rs := map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec{} for k, v := range tj.Spec.TestReplicaSpecs { - rs[commonv1.ReplicaType(k)] = v + rs[kubeflowv1.ReplicaType(k)] = v } return rs, nil } -func (r *TestReconciler) ExtractRunPolicy(job client.Object) (*commonv1.RunPolicy, error) { +func (r *TestReconciler) ExtractRunPolicy(job client.Object) (*kubeflowv1.RunPolicy, error) { tj := job.(*v1.TestJob) return tj.Spec.RunPolicy, nil } -func (r *TestReconciler) ExtractJobStatus(job client.Object) (*commonv1.JobStatus, error) { +func (r *TestReconciler) ExtractJobStatus(job client.Object) 
(*kubeflowv1.JobStatus, error) {
 	tj := job.(*v1.TestJob)
 	return &tj.Status, nil
 }
 
-func (r *TestReconciler) IsMasterRole(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, rtype commonv1.ReplicaType, index int) bool {
+func (r *TestReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, rtype kubeflowv1.ReplicaType, index int) bool {
 	return string(rtype) == string(v1.TestReplicaTypeMaster)
 }
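Every file in this patch follows the same pattern: the kubeflow.org/v1 package is now referenced through the single kubeflowv1 alias, whether the file previously duplicated the import under commonv1 or simply used the older alias. A minimal sketch of the resulting import shape; the package name example and the function isXGBoostMaster are illustrative, not part of this patch:

package example

import (
	// One alias now covers the labels, replica types, and job constants that
	// were previously reached through both commonv1 and kubeflowv1.
	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
)

// isXGBoostMaster mirrors the IsMasterRole checks above using only the
// consolidated alias.
func isXGBoostMaster(rtype kubeflowv1.ReplicaType) bool {
	return rtype == kubeflowv1.XGBoostJobReplicaTypeMaster
}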