-
Notifications
You must be signed in to change notification settings - Fork 718
/
Copy pathtest_e2e_tfjob.py
150 lines (124 loc) · 4.96 KB
/
test_e2e_tfjob.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Copyright 2021 kubeflow.org.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import logging
import pytest
from kubernetes.client import V1PodTemplateSpec
from kubernetes.client import V1ObjectMeta
from kubernetes.client import V1PodSpec
from kubernetes.client import V1Container
from kubernetes.client import V1ResourceRequirements
from kubeflow.training import TrainingClient
from kubeflow.training import V1ReplicaSpec
from kubeflow.training import V1RunPolicy
from kubeflow.training import KubeflowOrgV1TFJob
from kubeflow.training import KubeflowOrgV1TFJobSpec
from kubeflow.training import V1SchedulingPolicy
from kubeflow.training.constants import constants
from test.e2e.utils import verify_job_e2e, verify_unschedulable_job_e2e, get_pod_spec_scheduler_name
from test.e2e.constants import TEST_GANG_SCHEDULER_NAME_ENV_KEY
from test.e2e.constants import GANG_SCHEDULERS, NONE_GANG_SCHEDULERS
# Emit log records as the bare message text (no level/timestamp prefix).
logging.basicConfig(format="%(message)s")
# Set the root logger threshold explicitly so INFO records are not filtered;
# basicConfig() above does nothing if the root logger already has handlers.
logging.getLogger().setLevel(logging.INFO)
# Single SDK client shared by every test in this module (constructed with
# the client's default arguments at import time).
TRAINING_CLIENT = TrainingClient()
# Name given to the TFJob that each test creates, verifies, and deletes.
JOB_NAME = "tfjob-mnist-ci-test"
# Name of the container inside the worker pod; also passed to verify_job_e2e.
CONTAINER_NAME = "tensorflow"
# Scheduler selected through the environment; None when the variable is
# unset. The skipif markers below test membership of this value in
# GANG_SCHEDULERS / NONE_GANG_SCHEDULERS to pick which test runs.
GANG_SCHEDULER_NAME = os.getenv(TEST_GANG_SCHEDULER_NAME_ENV_KEY)
@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in NONE_GANG_SCHEDULERS,
    reason="For gang-scheduling",
)
def test_sdk_e2e_with_gang_scheduling(job_namespace):
    """Exercise a TFJob through the SDK with a gang scheduler configured.

    Skipped when ``GANG_SCHEDULER_NAME`` is one of ``NONE_GANG_SCHEDULERS``.

    Flow:
      1. Create a single-worker TFJob whose scheduling policy asks for
         ``min_available=10`` and check it with
         ``verify_unschedulable_job_e2e``.
      2. Patch the same job with ``min_available=1`` and check it with
         ``verify_job_e2e``.
      3. Delete the job.

    Args:
        job_namespace: Namespace the job is created in (supplied by a
            pytest fixture defined outside this file).
    """
    tf_container = generate_container()

    # Assemble the pod template piece by piece; Istio sidecar injection is
    # switched off through the pod annotation.
    pod_metadata = V1ObjectMeta(
        annotations={constants.ISTIO_SIDECAR_INJECTION: "false"},
    )
    scheduler_name = get_pod_spec_scheduler_name(GANG_SCHEDULER_NAME)
    pod_spec = V1PodSpec(containers=[tf_container], scheduler_name=scheduler_name)
    pod_template = V1PodTemplateSpec(metadata=pod_metadata, spec=pod_spec)
    worker = V1ReplicaSpec(replicas=1, restart_policy="Never", template=pod_template)

    # Two variants of the same job that differ only in min_available.
    # NOTE(review): 10 is presumably chosen because a single worker replica
    # can never satisfy it - confirm against the scheduler's semantics.
    unschedulable_tfjob = generate_tfjob(
        worker,
        scheduling_policy=V1SchedulingPolicy(min_available=10),
        job_namespace=job_namespace,
    )
    schedulable_tfjob = generate_tfjob(
        worker,
        scheduling_policy=V1SchedulingPolicy(min_available=1),
        job_namespace=job_namespace,
    )

    # Phase 1: submit the variant that requires 10 available members.
    TRAINING_CLIENT.create_tfjob(unschedulable_tfjob, job_namespace)
    logging.info(f"List of created {constants.TFJOB_KIND}s")
    logging.info(TRAINING_CLIENT.list_tfjobs(job_namespace))
    verify_unschedulable_job_e2e(
        TRAINING_CLIENT, JOB_NAME, job_namespace, constants.TFJOB_KIND
    )

    # Phase 2: patch the existing job down to min_available=1 and verify it.
    TRAINING_CLIENT.patch_tfjob(schedulable_tfjob, JOB_NAME, job_namespace)
    logging.info(f"List of patched {constants.TFJOB_KIND}s")
    logging.info(TRAINING_CLIENT.list_tfjobs(job_namespace))
    verify_job_e2e(
        TRAINING_CLIENT, JOB_NAME, job_namespace, constants.TFJOB_KIND, CONTAINER_NAME
    )

    # Phase 3: remove the job.
    TRAINING_CLIENT.delete_tfjob(JOB_NAME, job_namespace)
@pytest.mark.skipif(
    GANG_SCHEDULER_NAME in GANG_SCHEDULERS,
    reason="For plain scheduling",
)
def test_sdk_e2e(job_namespace):
    """Exercise a TFJob through the SDK without a gang scheduler.

    Skipped when ``GANG_SCHEDULER_NAME`` is one of ``GANG_SCHEDULERS``.
    Creates a single-worker TFJob, checks it with ``verify_job_e2e`` and
    then deletes it.

    Args:
        job_namespace: Namespace the job is created in (supplied by a
            pytest fixture defined outside this file).
    """
    tf_container = generate_container()

    # Pod template with Istio sidecar injection switched off via annotation.
    pod_metadata = V1ObjectMeta(
        annotations={constants.ISTIO_SIDECAR_INJECTION: "false"},
    )
    pod_spec = V1PodSpec(containers=[tf_container])
    pod_template = V1PodTemplateSpec(metadata=pod_metadata, spec=pod_spec)
    worker = V1ReplicaSpec(replicas=1, restart_policy="Never", template=pod_template)

    # No scheduling policy is passed, so generate_tfjob uses its default.
    tfjob = generate_tfjob(worker, job_namespace=job_namespace)

    TRAINING_CLIENT.create_tfjob(tfjob, job_namespace)
    logging.info(f"List of created {constants.TFJOB_KIND}s")
    logging.info(TRAINING_CLIENT.list_tfjobs(job_namespace))

    verify_job_e2e(
        TRAINING_CLIENT,
        JOB_NAME,
        job_namespace,
        constants.TFJOB_KIND,
        CONTAINER_NAME,
    )

    TRAINING_CLIENT.delete_tfjob(JOB_NAME, job_namespace)
def generate_tfjob(
    worker: V1ReplicaSpec,
    scheduling_policy: V1SchedulingPolicy = None,
    job_namespace: str = "default",
) -> KubeflowOrgV1TFJob:
    """Build the TFJob object used by the tests in this module.

    The job is always named ``JOB_NAME`` and has a single replica group,
    ``"Worker"``.

    Args:
        worker: Replica spec stored under the ``"Worker"`` key.
        scheduling_policy: Scheduling policy placed in the run policy;
            passed through as-is, including the default ``None``.
        job_namespace: Namespace written into the job metadata.

    Returns:
        The assembled ``KubeflowOrgV1TFJob``.
    """
    job_metadata = V1ObjectMeta(name=JOB_NAME, namespace=job_namespace)
    run_policy = V1RunPolicy(
        clean_pod_policy="None",
        scheduling_policy=scheduling_policy,
    )
    job_spec = KubeflowOrgV1TFJobSpec(
        run_policy=run_policy,
        tf_replica_specs={"Worker": worker},
    )
    tfjob = KubeflowOrgV1TFJob(
        api_version="kubeflow.org/v1",
        kind="TFJob",
        metadata=job_metadata,
        spec=job_spec,
    )
    return tfjob
def generate_container() -> V1Container:
    """Build the container spec shared by the TFJob tests.

    The container is named ``CONTAINER_NAME``, uses the
    ``tf-mnist-with-summaries`` image and runs the MNIST script with fixed
    arguments, under memory and CPU limits.

    Returns:
        The configured ``V1Container``.
    """
    # Command line executed inside the container.
    train_command = [
        "python",
        "/var/tf_mnist/mnist_with_summaries.py",
        "--log_dir=/train/logs",
        "--learning_rate=0.01",
        "--batch_size=150",
    ]
    # Upper bounds on the resources the container may consume.
    resource_limits = {"memory": "2Gi", "cpu": "0.75"}

    return V1Container(
        name=CONTAINER_NAME,
        image="gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0",
        command=train_command,
        resources=V1ResourceRequirements(limits=resource_limits),
    )