diff --git a/testing/e2e/conftest.py b/testing/e2e/conftest.py new file mode 100644 index 000000000..fbc70d557 --- /dev/null +++ b/testing/e2e/conftest.py @@ -0,0 +1,76 @@ +import pytest + +def pytest_addoption(parser): + parser.addoption( + "--app_path", action="store", default="", + help="Path where the KF application should be stored") + + parser.addoption( + "--app_name", action="store", default="", + help="Name of the KF application") + + parser.addoption( + "--kfctl_path", action="store", default="", + help="Path to kfctl.") + + parser.addoption( + "--namespace", action="store", default="kubeflow", + help="Namespace to use.") + + parser.addoption( + "--project", action="store", default="kubeflow-ci-deployment", + help="GCP project to deploy Kubeflow to") + + parser.addoption( + "--config_path", action="store", default="", + help="The config to use for kfctl init") + + parser.addoption( + "--use_basic_auth", action="store", default="False", + help="Use basic auth.") + + parser.addoption( + "--use_istio", action="store", default="False", + help="Use istio.") + +@pytest.fixture +def app_path(request): + return request.config.getoption("--app_path") + +@pytest.fixture +def app_name(request): + return request.config.getoption("--app_name") + +@pytest.fixture +def kfctl_path(request): + return request.config.getoption("--kfctl_path") + +@pytest.fixture +def namespace(request): + return request.config.getoption("--namespace") + +@pytest.fixture +def project(request): + return request.config.getoption("--project") + +@pytest.fixture +def config_path(request): + return request.config.getoption("--config_path") + +@pytest.fixture +def use_basic_auth(request): + value = request.config.getoption("--use_basic_auth").lower() + + if value in ["t", "true"]: + return True + else: + return False + +@pytest.fixture +def use_istio(request): + value = request.config.getoption("--use_istio").lower() + + if value in ["t", "true"]: + return True + else: + return False \ No newline 
at end of file diff --git a/testing/e2e/endpoint_ready_test.py b/testing/e2e/endpoint_ready_test.py new file mode 100644 index 000000000..96c9a3464 --- /dev/null +++ b/testing/e2e/endpoint_ready_test.py @@ -0,0 +1,36 @@ +import datetime +import logging +import os +import subprocess +import tempfile +import uuid +from retrying import retry + +import pytest + +from kubeflow.testing import util +from testing import deploy_utils +from testing import gcp_util + +def test_endpoint_is_ready(project, app_name): + """Test that Kubeflow was successfully deployed. + + Args: + project: The gcp project that we deployed kubeflow + app_name: The name of the kubeflow deployment + """ + # Owned by project kubeflow-ci-deployment. + os.environ["CLIENT_ID"] = "29647740582-7meo6c7a9a76jvg54j0g2lv8lrsb4l8g.apps.googleusercontent.com" + if not gcp_util.endpoint_is_ready( + "https://{}.endpoints.{}.cloud.goog".format(app_name, project), + wait_min=25): + raise Exception("Endpoint not ready") + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + pytest.main() diff --git a/testing/e2e/kf_is_ready_test.py b/testing/e2e/kf_is_ready_test.py new file mode 100644 index 000000000..9762979bb --- /dev/null +++ b/testing/e2e/kf_is_ready_test.py @@ -0,0 +1,104 @@ +import datetime +import logging +import os +import subprocess +import tempfile +import uuid +from retrying import retry + +import pytest + +from kubeflow.testing import util +from testing import deploy_utils + +def test_kf_is_ready(namespace, use_basic_auth, use_istio): + """Test that Kubeflow was successfully deployed. + + Args: + namespace: The namespace Kubeflow is deployed to. + """ + + logging.info("Using namespace %s", namespace) + + # Need to activate account for scopes. 
+ if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + util.run(["gcloud", "auth", "activate-service-account", + "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"]]) + + api_client = deploy_utils.create_k8s_client() + + util.load_kube_config() + + # Verify that components are actually deployed. + # TODO(jlewi): We need to parameterize this list based on whether + # we are using IAP or basic auth. + deployment_names = [ + "argo-ui", + "centraldashboard", + "cloud-endpoints-controller", + "jupyter-web-app-deployment", + "metadata-db", + "metadata-deployment", + "metadata-ui", + "ml-pipeline", + "ml-pipeline-scheduledworkflow", + "ml-pipeline-ui", + "notebook-controller-deployment", + "tf-job-operator", + "pytorch-operator", + "katib-controller", + "workflow-controller", + ] + + stateful_set_names = [ + "kfserving-controller-manager", + ] + + ingress_related_deployments = [] + ingress_related_stateful_sets = [] + + if use_basic_auth: + deployment_names.extend(["basic-auth-login"]) + ingress_related_stateful_sets.extend(["backend-updater"]) + else: + ingress_related_deployments.extend(["iap-enabler"]) + ingress_related_stateful_sets.extend(["backend-updater"]) + + # TODO(jlewi): Might want to parallelize this. 
+ for deployment_name in deployment_names: + logging.info("Verifying that deployment %s started...", deployment_name) + util.wait_for_deployment(api_client, namespace, deployment_name, 10) + + for stateful_set_name in stateful_set_names: + logging.info("Verifying that stateful set %s started...", stateful_set_name) + util.wait_for_statefulset(api_client, namespace, stateful_set_name) + + ingress_namespace = "istio-system" if use_istio else namespace + for deployment_name in ingress_related_deployments: + logging.info("Verifying that deployment %s started...", deployment_name) + util.wait_for_deployment(api_client, ingress_namespace, deployment_name, 10) + + for name in ingress_related_stateful_sets: + logging.info("Verifying that statefulset %s started...", name) + util.wait_for_statefulset(api_client, ingress_namespace, name) + + # TODO(jlewi): We should verify that the ingress is created and healthy. + + knative_namespace = "knative-serving" + knative_related_deployments = [ + "activator", + "autoscaler", + "controller", + ] + for deployment_name in knative_related_deployments: + logging.info("Verifying that deployment %s started...", deployment_name) + util.wait_for_deployment(api_client, knative_namespace, deployment_name, 10) + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + pytest.main() diff --git a/testing/e2e/kfctl_delete_test.py b/testing/e2e/kfctl_delete_test.py new file mode 100644 index 000000000..91ce0a9bb --- /dev/null +++ b/testing/e2e/kfctl_delete_test.py @@ -0,0 +1,74 @@ +"""Run kfctl delete as a pytest. + +We use this in order to generate a junit_xml file. 
+""" +import datetime +import logging +import os +import subprocess +import tempfile +import uuid +from retrying import retry + +import pytest + +from kubeflow.testing import util +from googleapiclient import discovery +from oauth2client.client import GoogleCredentials + +# TODO(gabrielwen): Move this to a separate test "kfctl_go_check_post_delete" +def get_endpoints_list(project): + cred = GoogleCredentials.get_application_default() + services_mgt = discovery.build('servicemanagement', 'v1', credentials=cred) + services = services_mgt.services() + next_page_token = None + endpoints = [] + + while True: + results = services.list(producerProjectId=project, + pageToken=next_page_token).execute() + + for s in results.get("services", {}): + name = s.get("serviceName", "") + endpoints.append(name) + if not "nextPageToken" in results: + break + next_page_token = results["nextPageToken"] + + return endpoints + +def test_kfctl_delete(kfctl_path, app_path, project): + if not kfctl_path: + raise ValueError("kfctl_path is required") + + if not app_path: + raise ValueError("app_path is required") + + logging.info("Using kfctl path %s", kfctl_path) + logging.info("Using app path %s", app_path) + + util.run([kfctl_path, "delete", "all", "--delete_storage", "-V"], + cwd=app_path) + + # Use services.list instead of services.get because error returned is not + # 404, it's 403 which is confusing. 
+ name = os.path.basename(app_path) + endpoint_name = "{deployment}.endpoints.{project}.cloud.goog".format( + deployment=name, + project=project) + logging.info("Verify endpoint service is deleted: " + endpoint_name) + if endpoint_name in get_endpoints_list(project): + msg = "Endpoint is not deleted: " + endpoint_name + logging.error(msg) + raise AssertionError(msg) + else: + logging.info("Verified endpoint service is deleted.") + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + pytest.main() diff --git a/testing/e2e/kfctl_go_test.py b/testing/e2e/kfctl_go_test.py new file mode 100644 index 000000000..18afc5fad --- /dev/null +++ b/testing/e2e/kfctl_go_test.py @@ -0,0 +1,158 @@ +import datetime +import logging +import os +import subprocess +import tempfile +import uuid +from retrying import retry +import yaml + +import pytest + +from kubeflow.testing import util + + +# retry 4 times, waiting 3 minutes between retries +@retry(stop_max_attempt_number=4, wait_fixed=180000) +def run_with_retries(*args, **kwargs): + util.run(*args, **kwargs) + + +def verify_kubeconfig(project, zone, app_path): + name = os.path.basename(app_path) + context = util.run(["kubectl", "config", "current-context"]).strip() + if name == context: + logging.info("KUBECONFIG current context name matches app name: " + name) + else: + msg = "KUBECONFIG not having expected context: {expected} v.s. {actual}".format( + expected=name, actual=context) + logging.error(msg) + raise RuntimeError(msg) + + +def test_build_kfctl_go(app_path, project, use_basic_auth, use_istio, config_path): + """Test building and deploying Kubeflow. + + Args: + app_path: The path to the Kubeflow app. + project: The GCP project to use. 
+ """ + if not app_path: + logging.info("--app_path not specified") + stamp = datetime.datetime.now().strftime("%H%M") + parent_dir = tempfile.gettempdir() + app_path = os.path.join( + parent_dir, "kfctl-{0}-{1}".format(stamp, + uuid.uuid4().hex[0:4])) + else: + parent_dir = os.path.dirname(app_path) + + logging.info("Using app path %s", app_path) + this_dir = os.path.dirname(__file__) + build_dir = os.path.abspath(os.path.join(this_dir, "..", "..")) + zone = 'us-central1-a' + + # Need to activate account for scopes. + if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + util.run([ + "gcloud", "auth", "activate-service-account", + "--key-file=" + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] + ]) + + # We need to use retry builds because when building in the test cluster + # we see intermittent failures pulling dependencies + run_with_retries(["make", "build-kfctl"], cwd=build_dir) + kfctl_path = os.path.join(build_dir, "bin", "kfctl") + + # Set ENV for basic auth username/password. + init_args = [] + if use_basic_auth: + os.environ["KUBEFLOW_USERNAME"] = "kf-test-user" + os.environ["KUBEFLOW_PASSWORD"] = str(uuid.uuid4().hex) + init_args = ["--use_basic_auth"] + else: + # Owned by project kubeflow-ci-deployment. + os.environ["CLIENT_SECRET"] = "CJ4qVPLTi0j0GJMkONj7Quwt" + os.environ["CLIENT_ID"] = ( + "29647740582-7meo6c7a9a76jvg54j0g2lv8lrsb4l8g" + ".apps.googleusercontent.com") + + if use_istio: + init_args.append("--use_istio") + else: + init_args.append("--use_istio=false") + + version = "master" + if os.getenv("REPO_NAME") != "manifests": + if os.getenv("PULL_NUMBER"): + version = "pull/{0}".format(os.getenv("PULL_NUMBER")) + pull_manifests = "@master" + if os.getenv("REPO_NAME") == "manifests": + if os.getenv("PULL_PULL_SHA"): + pull_manifests = "@" + os.getenv("PULL_PULL_SHA") + + # We need to specify a valid email because + # 1. We need to create appropriate RBAC rules to allow the current user + # to create the required K8s resources. + # 2. 
Setting the IAM policy will fail if the email is invalid. + email = util.run(["gcloud", "config", "get-value", "account"]) + + if not email: + raise ValueError("Could not determine GCP account being used.") + + # username and password are passed as env vars and won't appear in the logs + # TODO(https://github.com/kubeflow/kubeflow/issues/2831): Once kfctl + # supports loading version from a URI we should use that so that we + # pull the configs from the repo we checked out. + # + # We don't run with retries because if kfctl init exits with an error + # but creates app.yaml then rerunning init will fail because app.yaml + # already exists. So retrying ends up masking the original error message + with open(config_path, 'r') as f: + config_spec = yaml.load(f) + config_spec["spec"]["project"] = project + config_spec["spec"]["email"] = email + config_spec["spec"] = filterSpartakus(config_spec["spec"]) + repos = config_spec["spec"]["repos"] + if os.getenv("REPO_NAME") == "manifests": + for repo in repos: + for key, value in repo.items(): + if value == "https://github.com/kubeflow/manifests/archive/master.tar.gz": + repo["uri"] = str("https://github.com/kubeflow/manifests/archive/pull/"+str(os.getenv("PULL_NUMBER"))+"/head.tar.gz") + logging.info(str(config_spec)) + with open(os.path.join(parent_dir, "tmp.yaml"), "w") as f: + yaml.dump(config_spec, f) + util.run([ + kfctl_path, "init", app_path, "-V", + "--config=" + os.path.join(parent_dir, "tmp.yaml")], cwd=parent_dir) + util.run(["cat", "app.yaml"], cwd=app_path) + + run_with_retries([ + kfctl_path, "generate", "-V", "all", "--email=" + email, "--zone=" + zone + ], + cwd=app_path) + + # We need to use retries because if we don't we see random failures + # where kfctl just appears to die. 
+ # + # Do not run with retries since it masks errors + util.run([kfctl_path, "apply", "-V", "all"], cwd=app_path) + + verify_kubeconfig(project, zone, app_path) + +def filterSpartakus(spec): + for i, app in enumerate(spec["applications"]): + if app["name"] == "spartakus": + spec["applications"].pop(i) + break + return spec + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + pytest.main() diff --git a/testing/workflows/components/kfctl_go_test.jsonnet b/testing/workflows/components/kfctl_go_test.jsonnet index 74ab7fae7..8b4281541 100644 --- a/testing/workflows/components/kfctl_go_test.jsonnet +++ b/testing/workflows/components/kfctl_go_test.jsonnet @@ -1,4 +1,4 @@ -// Uses test from kubeflow/kubeflow/testing/workflows/components/kfctl_go_test.jsonnet +// Uses test from kubeflow/kfctl/testing/workflows/components/kfctl_go_test.jsonnet // Any changes should reflect here and there // E2E test for the new go based version of kfctl. @@ -29,11 +29,11 @@ local outputDir = testDir + "/output"; local artifactsDir = outputDir + "/artifacts"; // Source directory where all repos should be checked out local srcRootDir = testDir + "/src"; -// The directory containing the kubeflow/kubeflow repo -local srcDir = srcRootDir + "/kubeflow/kubeflow"; +// The directory containing the kubeflow/kfctl repo +local srcDir = srcRootDir + "/kubeflow/kfctl"; local runPath = srcDir + "/testing/workflows/run.sh"; -local kfCtlPath = srcDir + "/bootstrap/bin/kfctl"; +local kfCtlPath = srcDir + "/bin/kfctl"; local kubeConfig = testDir + "/kfctl_test/.kube/kubeconfig"; // Name for the Kubeflow app. @@ -202,7 +202,7 @@ local dagTemplates = [ env_vars=[{ name: "EXTRA_REPOS", // TODO(jlewi): Stop pinning to 341 once its submitted. 
- value: "kubeflow/kubeflow@HEAD;kubeflow/tf-operator@HEAD;kubeflow/testing@HEAD", + value: "kubeflow/tf-operator@HEAD;kubeflow/testing@HEAD", }]), dependencies: null, }, // checkout @@ -236,7 +236,7 @@ "-o", "junit_suite_name=test_kfctl_go_deploy_" + nameSuffix, "--app_path=" + appDir, ], - working_dir=srcDir+ "/testing/kfctl", + working_dir=srcDir+ "/testing/e2e", ), dependencies: ["checkout"], }, diff --git a/testing/workflows/components/kubeflow_workflow.libsonnet b/testing/workflows/components/kubeflow_workflow.libsonnet index ad466b386..c39f4d802 100644 --- a/testing/workflows/components/kubeflow_workflow.libsonnet +++ b/testing/workflows/components/kubeflow_workflow.libsonnet @@ -77,8 +77,8 @@ artifactsDir: self.outputDir + "/artifacts", // Source directory where all repos should be checked out srcRootDir: self.testDir + "/src", - // The directory containing the kubeflow/kubeflow repo - srcDir: self.srcRootDir + "/kubeflow/kubeflow", + // The directory containing the kubeflow/kfctl repo + srcDir: self.srcRootDir + "/kubeflow/kfctl", image: "gcr.io/kubeflow-ci/test-worker:latest", // value of KUBECONFIG environment variable. This should be a full path. @@ -409,11 +409,9 @@ local artifactsDir = outputDir + "/artifacts"; // Source directory where all repos should be checked out local srcRootDir = testDir + "/src"; - // The directory containing the kubeflow/kubeflow repo - local srcDir = srcRootDir + "/kubeflow/kubeflow"; - local bootstrapDir = srcDir + "/bootstrap"; + // The directory containing the kubeflow/kfctl repo + local srcDir = srcRootDir + "/kubeflow/kfctl"; local image = "gcr.io/kubeflow-ci/test-worker:latest"; - local bootstrapperImage = "gcr.io/kubeflow-ci/bootstrapper:" + name; // The last 4 digits of the name should be a unique id. 
local deploymentName = "e2e-" + std.substr(name, std.length(name) - 4, 4); local v1alpha1Suffix = "-v1alpha1"; @@ -653,7 +651,7 @@ ["/usr/local/bin/checkout.sh", srcRootDir], env_vars=[{ name: "EXTRA_REPOS", - value: "kubeflow/tf-operator@HEAD;kubeflow/testing@HEAD", + value: "kubeflow/kubeflow@HEAD;kubeflow/tf-operator@HEAD;kubeflow/testing@HEAD", }], ), buildTemplate("test-dir-delete", [ diff --git a/testing/workflows/run.sh b/testing/workflows/run.sh new file mode 100755 index 000000000..3b92615a0 --- /dev/null +++ b/testing/workflows/run.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +# +# A simple wrapper script to run a command in the e2e tests. +# This script performs common functions like +# activating the service account. +set -ex +gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} + +echo Working Directory=$(pwd) +# Execute the actual command. +# TODO(jlewi): We should add retries on error. + +# Retry up to 3 times +for i in $(seq 1 3); do + set +e + "$@" + result=$? + set -e + if [[ ${result} -eq 0 ]]; then + echo command ran successfully + exit 0 + fi + + echo Command failed: "$@" +done +echo "command didn't succeed" +exit 1