Skip to content

Commit

Permalink
Add suspend field to runPolicy
Browse files Browse the repository at this point in the history
  • Loading branch information
tenzen-y committed Jul 11, 2023
1 parent 9e084ff commit 938a343
Show file tree
Hide file tree
Showing 26 changed files with 94 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/api/kubeflow.org_v1_generated.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,8 @@ RunPolicy encapsulates various runtime policies of the distributed training job,
| *`activeDeadlineSeconds`* __integer__ | Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer.
| *`backoffLimit`* __integer__ | Optional number of retries before marking this job failed.
| *`schedulingPolicy`* __xref:{anchor_prefix}-github-com-kubeflow-training-operator-pkg-apis-kubeflow-org-v1-schedulingpolicy[$$SchedulingPolicy$$]__ | SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling
| *`suspend`* __boolean__ | suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job.
Defaults to false.
|===


Expand Down
4 changes: 4 additions & 0 deletions hack/python-sdk/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,10 @@
"description": "SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling",
"$ref": "#/definitions/kubeflow.org.v1.SchedulingPolicy"
},
"suspend": {
"description": "suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job.\n\nDefaults to false.",
"type": "boolean"
},
"ttlSecondsAfterFinished": {
"description": "TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite.",
"type": "integer",
Expand Down
16 changes: 16 additions & 0 deletions pkg/apis/kubeflow.org/v1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ const (
// The training is complete without error.
JobSucceeded JobConditionType = "Succeeded"

// JobSuspended means the job has been suspended.
JobSuspended JobConditionType = "Suspended"

// JobFailed means one or more sub-resources (e.g. services/pods) of this job
// reached phase failed with no restarting.
// The training has failed its execution.
Expand Down Expand Up @@ -205,6 +208,19 @@ type RunPolicy struct {
// SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling
// +optional
SchedulingPolicy *SchedulingPolicy `json:"schedulingPolicy,omitempty"`

// suspend specifies whether the Job controller should create Pods or not.
// If a Job is created with suspend set to true, no Pods are created by
// the Job controller. If a Job is suspended after creation (i.e. the
// flag goes from false to true), the Job controller will delete all
// active Pods and PodGroups associated with this Job.
// Users must design their workload to gracefully handle this.
// Suspending a Job will reset the StartTime field of the Job.
//
// Defaults to false.
// +kubebuilder:default:=false
// +optional
Suspend *bool `json:"suspend,omitempty"`
}

// SchedulingPolicy encapsulates various scheduling policies of the distributed training
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/kubeflow.org/v1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/apis/kubeflow.org/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions sdk/python/docs/KubeflowOrgV1RunPolicy.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Name | Type | Description | Notes
**backoff_limit** | **int** | Optional number of retries before marking this job failed. | [optional]
**clean_pod_policy** | **str** | CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. | [optional]
**scheduling_policy** | [**KubeflowOrgV1SchedulingPolicy**](KubeflowOrgV1SchedulingPolicy.md) | | [optional]
**suspend** | **bool** | suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job. Defaults to false. | [optional]
**ttl_seconds_after_finished** | **int** | TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite. | [optional]

[[Back to Model list]](../README.md#documentation-for-models) [[Back to API list]](../README.md#documentation-for-api-endpoints) [[Back to README]](../README.md)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class KubeflowOrgV1RunPolicy(object):
'backoff_limit': 'int',
'clean_pod_policy': 'str',
'scheduling_policy': 'KubeflowOrgV1SchedulingPolicy',
'suspend': 'bool',
'ttl_seconds_after_finished': 'int'
}

Expand All @@ -45,10 +46,11 @@ class KubeflowOrgV1RunPolicy(object):
'backoff_limit': 'backoffLimit',
'clean_pod_policy': 'cleanPodPolicy',
'scheduling_policy': 'schedulingPolicy',
'suspend': 'suspend',
'ttl_seconds_after_finished': 'ttlSecondsAfterFinished'
}

def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_policy=None, scheduling_policy=None, ttl_seconds_after_finished=None, local_vars_configuration=None): # noqa: E501
def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_policy=None, scheduling_policy=None, suspend=None, ttl_seconds_after_finished=None, local_vars_configuration=None): # noqa: E501
"""KubeflowOrgV1RunPolicy - a model defined in OpenAPI""" # noqa: E501
if local_vars_configuration is None:
local_vars_configuration = Configuration()
Expand All @@ -58,6 +60,7 @@ def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_p
self._backoff_limit = None
self._clean_pod_policy = None
self._scheduling_policy = None
self._suspend = None
self._ttl_seconds_after_finished = None
self.discriminator = None

Expand All @@ -69,6 +72,8 @@ def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_p
self.clean_pod_policy = clean_pod_policy
if scheduling_policy is not None:
self.scheduling_policy = scheduling_policy
if suspend is not None:
self.suspend = suspend
if ttl_seconds_after_finished is not None:
self.ttl_seconds_after_finished = ttl_seconds_after_finished

Expand Down Expand Up @@ -162,6 +167,29 @@ def scheduling_policy(self, scheduling_policy):

self._scheduling_policy = scheduling_policy

@property
def suspend(self):
"""Gets the suspend of this KubeflowOrgV1RunPolicy. # noqa: E501
suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job. Defaults to false. # noqa: E501
:return: The suspend of this KubeflowOrgV1RunPolicy. # noqa: E501
:rtype: bool
"""
return self._suspend

@suspend.setter
def suspend(self, suspend):
"""Sets the suspend of this KubeflowOrgV1RunPolicy.
suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job. Defaults to false. # noqa: E501
:param suspend: The suspend of this KubeflowOrgV1RunPolicy. # noqa: E501
:type: bool
"""

self._suspend = suspend

@property
def ttl_seconds_after_finished(self):
"""Gets the ttl_seconds_after_finished of this KubeflowOrgV1RunPolicy. # noqa: E501
Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_mpi_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
slots_per_worker = 56, ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_mpi_job_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
slots_per_worker = 56, ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
Expand Down Expand Up @@ -118,6 +119,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
slots_per_worker = 56, ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_mpi_job_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
slots_per_worker = 56
)
Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_mx_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_mx_job_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down Expand Up @@ -115,6 +116,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_mx_job_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, )
)
else :
Expand All @@ -78,6 +79,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
)

Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_paddle_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_paddle_job_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down Expand Up @@ -127,6 +128,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_paddle_job_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, )
)
else :
Expand All @@ -83,6 +84,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
)

Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_py_torch_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_py_torch_job_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down Expand Up @@ -149,6 +150,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ), ),
status = kubeflow_org_v1_job_status.KubeflowOrgV1JobStatus(
completion_time = None,
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_py_torch_job_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, )
)
else :
Expand All @@ -94,6 +95,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
)

Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_run_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56
)
else :
Expand Down
1 change: 1 addition & 0 deletions sdk/python/test/test_kubeflow_org_v1_tf_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
success_policy = '0',
tf_replica_specs = {
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_tf_job_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
success_policy = '0',
tf_replica_specs = {
Expand Down Expand Up @@ -110,6 +111,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
success_policy = '0',
tf_replica_specs = {
Expand Down
2 changes: 2 additions & 0 deletions sdk/python/test/test_kubeflow_org_v1_tf_job_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
success_policy = '0',
tf_replica_specs = {
Expand All @@ -72,6 +73,7 @@ def make_instance(self, include_optional):
priority_class = '0',
queue = '0',
schedule_timeout_seconds = 56, ),
suspend = True,
ttl_seconds_after_finished = 56, ),
tf_replica_specs = {
'key' : kubeflow_org_v1_replica_spec.KubeflowOrgV1ReplicaSpec(
Expand Down
Loading

0 comments on commit 938a343

Please sign in to comment.