Skip to content

Commit

Permalink
Fix the xgboost_synthetic test so it actually runs and produces signal
Browse files Browse the repository at this point in the history
* The test wasn't actually running because we were passing arguments that
  were unknown to pytest

* Remove the old role.yaml; we don't use it anymore

* Wait for the Job to finish and properly report status; kubeflow/testing#514
  contains the new routine

* The test still isn't passing because of kubeflow#673

* In addition we need to fix the auto deployments kubeflow/testing#444

Related to kubeflow#665
  • Loading branch information
Jeremy Lewi committed Nov 2, 2019
1 parent 452aa42 commit 52b0770
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 115 deletions.
4 changes: 1 addition & 3 deletions py/kubeflow/examples/create_e2e_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,11 +238,9 @@ def _build_tests_dag(self):
# Test xgboost
step_name = "xgboost-synthetic"
command = ["pytest", "xgboost_test.py",
# I think -s means stdout/stderr will be printed out to aid in debugging.
# Failures still appear to be captured and stored in the junit file.
"-s",
# Increase the log level so that info level log statements show up.
"--log-cli-level=info",
"--log-cli-format='%(levelname)s|%(asctime)s|%(pathname)s|%(lineno)d| %(message)'",
# Test timeout in seconds.
"--timeout=1800",
"--junitxml=" + self.artifacts_dir + "/junit_xgboost-synthetic-test.xml",
Expand Down
7 changes: 0 additions & 7 deletions xgboost_synthetic/testing/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ def pytest_addoption(parser):
parser.addoption(
"--repos", help="The repos to checkout; leave blank to use defaults",
type=str, default="")
parser.addoption(
"--cluster", help="The cluster which the applition running in", type=str,
default="")

@pytest.fixture
def name(request):
Expand All @@ -33,7 +30,3 @@ def image(request):
@pytest.fixture
def repos(request):
return request.config.getoption("--repos")

@pytest.fixture
def cluster(request):
return request.config.getoption("--cluster")
14 changes: 4 additions & 10 deletions xgboost_synthetic/testing/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ apiVersion: batch/v1
kind: Job
metadata:
name: xgboost-test
labels:
app: xgboost-synthetic-test
spec:
backoffLimit: 1
template:
metadata:
annotations:
Expand All @@ -11,7 +14,7 @@ spec:
# in notebooks?
sidecar.istio.io/inject: "false"
labels:
app: xgboost-synthetics-testing
app: xgboost-synthetic-test
spec:
restartPolicy: Never
securityContext:
Expand All @@ -22,20 +25,11 @@ spec:
- /usr/local/bin/checkout_repos.sh
- --repos=kubeflow/examples@$(CHECK_TAG)
- --src_dir=/src
# TODO(jlewi): Do we need to do depth all here?
- --depth=all
name: checkout
# TODO(jlewi): Set in kustomization.yaml?
image: gcr.io/kubeflow-ci/test-worker:v20190802-c6f9140-e3b0c4
volumeMounts:
- mountPath: /src
name: src
env:
- name: CHECK_TAG
valueFrom:
configMapKeyRef:
name: xgb-notebooks-tests
key: checkTag
containers:
- name: executing-notebooks
image: execute-image
Expand Down
37 changes: 0 additions & 37 deletions xgboost_synthetic/testing/role.yaml

This file was deleted.

79 changes: 21 additions & 58 deletions xgboost_synthetic/testing/xgboost_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,28 +9,19 @@

from kubernetes import client as k8s_client
from kubernetes.client import rest
from kubeflow.testing import argo_build_util
from kubeflow.testing import util

# TODO(jlewi): This test is currently failing because various things
# need to be updated to work with 0.7.0. Until that's fixed we mark it
# as expected to fail so we can begin to get signal.
@pytest.mark.xfail
def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pylint: disable=too-many-branches,too-many-statements
repos, image, app_dir):
def test_xgboost_synthetic(record_xml_attribute, name, namespace, # pylint: disable=too-many-branches,too-many-statements
repos, image):
'''Generate Job and submit.'''
util.set_pytest_junit(record_xml_attribute, "test_xgboost_synthetic")
app_dir = os.path.abspath(app_dir)

if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
util.run(['gcloud', 'auth', 'activate-service-account',
"--key-file=" + os.getenv("GOOGLE_APPLICATION_CREDENTIALS")],
cwd=app_dir)

# TODO(jlewi): We should just assume that kubeconfig has been set.
if cluster:
util.run(['gcloud', '--project=kubeflow-ci-deployment', 'container',
"clusters", "get-credentials", "--zone=us-east1-b", cluster],
cwd=app_dir)
util.maybe_activate_service_account()

with open("job.yaml") as hf:
job = yaml.load(hf)
Expand All @@ -42,16 +33,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
# See
# https://github.com/kubernetes/test-infra/blob/45246b09ed105698aa8fb928b7736d14480def29/prow/jobs.md#job-environment-variables
if not repos:
version = "@HEAD"
if os.getenv("PULL_NUMBER"):
version = "@{0}:{1}".format(os.getenv("PULL_PULL_SHA"),
os.getenv("PULL_NUMBER"))

else:
if os.getenv("PULL_BASE_SHA"):
version = "@{0}".format(os.getenv("PULL_BASE_SHA"))

repos = "kubeflow/examples" + version
repos = argo_build_util.get_repo_from_prow_env()

logging.info("Repos set to %s", repos)
job["spec"]["template"]["spec"]["initContainers"][0]["command"] = [
Expand All @@ -69,6 +51,7 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
job["metadata"]["name"] = ("xgboost-test-" +
datetime.datetime.now().strftime("%H%M%S")
+ "-" + uuid.uuid4().hex[0:3])
name = job["metadata"]["name"]

job["metadata"]["namespace"] = namespace

Expand All @@ -77,43 +60,23 @@ def test_xgboost_synthetic(record_xml_attribute, name, namespace, cluster, # pyl
batch_api = k8s_client.BatchV1Api(api_client)

logging.info("Creating job:\n%s", yaml.dump(job))
batch_api.create_namespaced_job(job["metadata"]["namespace"], job)
actual_job = batch_api.create_namespaced_job(job["metadata"]["namespace"],
job)
logging.info("Created job %s in namespaces %s", name, namespace)

# Wait for job.
end_time = datetime.datetime.now() + datetime.timedelta(
minutes=15)

last_condition = None
while datetime.datetime.now() < end_time:
try:
job = batch_api.read_namespaced_job(name, namespace)
except rest.ApiException as e:
logging.error("There was a problem getting job %s.%s; %s",
namespace, name, e)
time.sleep(10)
continue
# ready_replicas could be None
if not job.conditions:
logging.info("Job missing condition")
time.sleep(10)
continue

last_condition = job.conditions[-1]
if last_condition["type"] in ["Failed", "Complete"]:
break
logging.info("Waiting for job %s.%s", namespace, name)
time.sleep(10)

logging.info("Final Job spec:\n%s", yaml.safe_dump(job))
util.run(["kubectl", "describe", "job", "-n", namespace, name])

if not last_condition or last_condition["type"] not in ["Failed", "Complete"]:
logging.error("Timeout waiting for job %s.%s to finish.", namespace, name)
raise RuntimeError("Job {0}.{1} has last condition {2} which is not "
"Complete".format(namespace, name,
last_condition["type"] in ["Failed", "Complete"]))
assert last_condition["type"] == "Complete"
final_job = util.wait_for_job(api_client, namespace, name,
timeout=datetime.timedelta(minutes=30))

logging.info("Final job:\n%s", yaml.safe_dump(final_job))

if not job.status.conditions:
raise RuntimeError("Job {0}.{1}; did not complete".format(namespace, name))

last_condition = job.status.conditions[-1]

if last_condition.type not in ["Complete"]:
logging.error("Job didn't complete successfully")
raise RuntimeError("Job {0}.{1} failed".format(namespace, name))

if __name__ == "__main__":
logging.basicConfig(level=logging.INFO,
Expand Down

0 comments on commit 52b0770

Please sign in to comment.