diff --git a/py/kubeflow/testing/cleanup_ci.py b/py/kubeflow/testing/cleanup_ci.py
index 536dcd3a2..b29fcfa7f 100644
--- a/py/kubeflow/testing/cleanup_ci.py
+++ b/py/kubeflow/testing/cleanup_ci.py
@@ -17,19 +17,26 @@ from googleapiclient import discovery
 from oauth2client.client import GoogleCredentials
 
 
+# See https://github.com/kubeflow/testing/issues/444
+# We are switching to unique names for auto deployments
+# So this matches the new ones.
+AUTO_DEPLOY_PATTERN = re.compile(r"kf-vmaster-(?!n\d\d)")
+
 # Regexes that select matching deployments
 MATCHING = [re.compile("e2e-.*"), re.compile("kfctl.*"),
-            re.compile("z-.*"), re.compile(".*presubmit.*")]
+            re.compile("z-.*"), re.compile(".*presubmit.*"),
+            AUTO_DEPLOY_PATTERN]
 
 MATCHING_FIREWALL_RULES = [re.compile("gke-kfctl-.*"),
                            re.compile("gke-e2e-.*"),
                            re.compile(".*presubmit.*"),
-                           re.compile(".*postsubmit.*")]
+                           re.compile(".*postsubmit.*"),
+                           AUTO_DEPLOY_PATTERN]
 
 # Regexes that select matching disks
 MATCHING_DISK = [re.compile(".*jlewi.*"), re.compile(".*kfctl.*"),
                  re.compile(".*postsubmit.*"), re.compile(".*presubmit.*"),
-                 ]
+                 AUTO_DEPLOY_PATTERN]
 
 def is_match_disk(name):
   for m in MATCHING_DISK:
diff --git a/py/kubeflow/testing/create_kf_instance.py b/py/kubeflow/testing/create_kf_instance.py
index b8fc8bdfc..fc54fb384 100644
--- a/py/kubeflow/testing/create_kf_instance.py
+++ b/py/kubeflow/testing/create_kf_instance.py
@@ -2,6 +2,9 @@
 
 The purpose of this script is to automate the creation of Kubeflow Deployments
 corresponding to different versions of Kubeflow.
+
+TODO: This script is obsolete; we should get rid of it in favor of
+  create_unique_kf_instance.py.
 """
 import argparse
 import logging
@@ -106,18 +109,23 @@ def deploy_with_kfctl_go(kfctl_path, args, app_dir, env):
 
   config_spec["spec"] = util.filter_spartakus(config_spec["spec"])
 
+  # Remove name because we will auto infer from directory.
+  if "name" in config_spec["metadata"]:
+    logging.info("Deleting name in kfdef spec.")
+    del config_spec["metadata"]["name"]
+
   logging.info("KFDefSpec:\n%s", str(config_spec))
-  with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
-    config_file = f.name
-    logging.info("Writing file %s", f.name)
-    yaml.dump(config_spec, f)
 
-  util.run([kfctl_path, "init", app_dir, "-V", "--config=" + config_file],
-           env=env)
+  if not os.path.exists(app_dir):
+    logging.info("Creating app dir %s", app_dir)
+    os.makedirs(app_dir)
 
-  util.run([kfctl_path, "generate", "-V", "all"], env=env, cwd=app_dir)
+  config_file = os.path.join(app_dir, "kf_config.yaml")
+  with open(config_file, "w") as hf:
+    logging.info("Writing file %s", config_file)
+    yaml.dump(config_spec, hf)
 
-  util.run([kfctl_path, "apply", "-V", "all"], env=env, cwd=app_dir)
+  util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)
 
 def main(): # pylint: disable=too-many-locals,too-many-statements
   logging.basicConfig(level=logging.INFO,
diff --git a/py/kubeflow/testing/create_unique_kf_instance.py b/py/kubeflow/testing/create_unique_kf_instance.py
new file mode 100644
index 000000000..99d4de4ea
--- /dev/null
+++ b/py/kubeflow/testing/create_unique_kf_instance.py
@@ -0,0 +1,231 @@
+"""Create a Kubeflow instance.
+
+The purpose of this script is to automate the creation of Kubeflow Deployments
+corresponding to different versions of Kubeflow.
+
+This script should replace create_kf_instance. Unlike create_kf_instance
+we no longer need to recycle kf app names because of IAP so we can
+use unique names, which greatly simplifies things. In particular, we don't
+need to do any cleanup in this script because we will rely on cleanup_ci to
+GC old auto deployments.
+"""
+import argparse
+import datetime
+import logging
+import json
+import os
+import re
+import requests
+import shutil
+import subprocess
+import tempfile
+import uuid
+import yaml
+
+from googleapiclient import discovery
+from google.cloud import storage
+from kubeflow.testing import util
+from retrying import retry
+from oauth2client.client import GoogleCredentials
+
+@retry(wait_fixed=60000, stop_max_attempt_number=5)
+def run_with_retry(*args, **kwargs):
+  """Run util.run, making at most 5 attempts spaced 60 seconds apart.
+
+  All arguments are forwarded to util.run and its result is returned.
+  """
+  return util.run(*args, **kwargs)
+
+def build_kfctl_go(args):
+  """Build kfctl go.
+
+  Args:
+    args: Parsed command line arguments; args.kubeflow_repo is used as the
+      root of the Kubeflow checkout.
+
+  Returns:
+    Path to the kfctl binary under <kubeflow_repo>/bootstrap/bin.
+  """
+  build_dir = os.path.join(args.kubeflow_repo, "bootstrap")
+  # We need to use retry builds because when building in the test cluster
+  # we see intermittent failures pulling dependencies
+  run_with_retry(["make", "build-kfctl"], cwd=build_dir)
+  kfctl_path = os.path.join(build_dir, "bin", "kfctl")
+
+  return kfctl_path
+
+def deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=None):
+  """Deploy Kubeflow using kfctl go binary.
+
+  Args:
+    kfctl_path: Path to the kfctl binary.
+    args: Parsed command line arguments; kfctl_config, project and zone
+      are used.
+    app_dir: Directory the KFDef config is written to; created if missing.
+    env: Environment variables passed to kfctl.
+    labels: Optional dict of labels merged into the KFDef metadata labels.
+
+  Raises:
+    requests.HTTPError: If fetching the config URL returns an error status.
+    ValueError: If the gcloud account cannot be determined.
+  """
+  # username and password are passed as env vars and won't appear in the logs
+  #
+  # We need to edit and rewrite the config file to the app dir because
+  # kfctl uses the path of the config file as the app dir.
+  logging.warning("Loading configs %s.", args.kfctl_config)
+
+  if args.kfctl_config.startswith("http"):
+    response = requests.get(args.kfctl_config)
+    # Fail fast instead of parsing an HTTP error page as the KFDef.
+    response.raise_for_status()
+    raw_config = response.content
+  else:
+    with open(args.kfctl_config) as hf:
+      raw_config = hf.read()
+
+  # Use safe_load; the config can come from a URL so don't construct
+  # arbitrary Python objects from it.
+  config_spec = yaml.safe_load(raw_config)
+
+  # We need to specify a valid email because
+  # 1. We need to create appropriate RBAC rules to allow the current user
+  # to create the required K8s resources.
+  # 2. Setting the IAM policy will fail if the email is invalid.
+  email = util.run(["gcloud", "config", "get-value", "account"])
+
+  if not email:
+    raise ValueError("Could not determine GCP account being used.")
+
+  config_spec["spec"]["project"] = args.project
+  config_spec["spec"]["email"] = email
+  config_spec["spec"]["zone"] = args.zone
+
+  config_spec["spec"] = util.filter_spartakus(config_spec["spec"])
+
+  # Remove name because we will auto infer from directory.
+  if "name" in config_spec["metadata"]:
+    logging.info("Deleting name in kfdef spec.")
+    del config_spec["metadata"]["name"]
+
+  if "labels" not in config_spec["metadata"]:
+    config_spec["metadata"]["labels"] = {}
+
+  if labels:
+    config_spec["metadata"]["labels"].update(labels)
+
+  logging.info("KFDefSpec:\n%s", yaml.safe_dump(config_spec))
+
+  if not os.path.exists(app_dir):
+    logging.info("Creating app dir %s", app_dir)
+    os.makedirs(app_dir)
+
+  config_file = os.path.join(app_dir, "kf_config.yaml")
+  with open(config_file, "w") as hf:
+    logging.info("Writing file %s", config_file)
+    yaml.dump(config_spec, hf)
+
+  util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)
+
+def main(): # pylint: disable=too-many-locals,too-many-statements
+  """Parse the flags, build kfctl and deploy a uniquely named Kubeflow."""
+  logging.basicConfig(level=logging.INFO,
+                      format=('%(levelname)s|%(asctime)s'
+                              '|%(pathname)s|%(lineno)d| %(message)s'),
+                      datefmt='%Y-%m-%dT%H:%M:%S',
+                      )
+  logging.getLogger().setLevel(logging.INFO)
+
+  parser = argparse.ArgumentParser()
+
+  parser.add_argument(
+    "--project", default="kubeflow-ci-deployment", type=str,
+    help=("The project."))
+
+  parser.add_argument(
+    "--zone", default="us-east1-d", type=str, help=("The zone to deploy in."))
+  parser.add_argument(
+    "--oauth_file",
+    default=("gs://kubeflow-ci-deployment_kf-data/"
+             "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
+    type=str, help=("The file containing the OAuth client ID & secret "
+                    "for IAP."))
+
+  parser.add_argument(
+    "--kubeflow_repo",
+    default="/src/kubeflow/kubeflow",
+    type=str, help=("Path to the Kubeflow repo to use"))
+
+  parser.add_argument(
+    "--kfctl_config",
+    default=("https://raw.githubusercontent.com/kubeflow/manifests"
+             "/master/kfdef/kfctl_gcp_iap.yaml"),
+    type=str, help=("Path to the kfctl config to use"))
+
+  parser.add_argument(
+    "--apps_dir",
+    default=os.getcwd(),
+    type=str, help=("Directory to store kubeflow apps."))
+
+  parser.add_argument(
+    "--name", type=str, default="kf-vmaster-{uid}",
+    help=("Name for the deployment. This can be a python format string "
+          "with the variable uid. Uid will automatically be substituted "
+          "for a unique value based on the time."))
+
+  parser.add_argument(
+    "--job_name",
+    default="", type=str, help=("Pod name running the job."))
+
+  args = parser.parse_args()
+
+  util.maybe_activate_service_account()
+
+  bucket, blob_path = util.split_gcs_uri(args.oauth_file)
+
+  client = storage.Client(project=args.project)
+  bucket = client.get_bucket(bucket)
+
+  blob = bucket.get_blob(blob_path)
+  contents = blob.download_as_string()
+
+  oauth_info = yaml.safe_load(contents)
+
+  git_describe = util.run(["git", "describe", "--tags", "--always", "--dirty"],
+                          cwd=args.kubeflow_repo).strip("'")
+
+  kfctl_path = build_kfctl_go(args)
+
+  # We need to keep the name short to avoid hitting limits with certificates.
+  uid = datetime.datetime.now().strftime("%m%d") + "-"
+  uid = uid + uuid.uuid4().hex[0:3]
+
+  args.name = args.name.format(uid=uid)
+  logging.info("Using name %s", args.name)
+
+  app_dir = os.path.join(args.apps_dir, args.name)
+
+  if not os.path.exists(args.apps_dir):
+    os.makedirs(args.apps_dir)
+
+  env = {}
+  env.update(os.environ)
+  env.update(oauth_info)
+
+  raw_labels = { "GIT_LABEL": git_describe,
+                 "PURPOSE": "kf-test-cluster",
+               }
+
+  labels = {}
+  for k, v in raw_labels.items():
+    # labels can only take as input alphanumeric characters, hyphens, and
+    # underscores. Replace invalid characters with hyphens.
+    val = v.lower().replace("\"", "")
+    val = re.sub(r"[^a-z0-9\-_]", "-", val)
+    # Deploy with the sanitized key/value, not the raw ones.
+    labels[k.lower()] = val
+
+  deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)
+
+if __name__ == "__main__":
+  main()
diff --git a/test-infra/auto-deploy/deploy-cron-master.yaml b/test-infra/auto-deploy/deploy-cron-master.yaml
index d6165d346..27525908b 100644
--- a/test-infra/auto-deploy/deploy-cron-master.yaml
+++ b/test-infra/auto-deploy/deploy-cron-master.yaml
@@ -12,56 +12,57 @@ spec:
       backoffLimit: 2
       template:
         spec:
+          initContainers:
+          - command:
+            - /usr/local/bin/checkout_repos.sh
+            # Stop using PR #495 once it is submitted
+            - --repos=kubeflow/kubeflow@HEAD,kubeflow/testing@HEAD:495
+            - --src_dir=/src
+            env:
+            - name: PYTHONPATH
+              value: /src/kubeflow/testing/py
+            - name: GOOGLE_APPLICATION_CREDENTIALS
+              value: /secret/gcp-credentials/key.json
+            image: gcr.io/kubeflow-ci/test-worker@sha256:dd559f89b3cbd926ec563559995f25025eecc6290b3146f17f82d2f084d07ee2
+            imagePullPolicy: IfNotPresent
+            name: checkout
+            resources: {}
+            terminationMessagePath: /dev/termination-log
+            terminationMessagePolicy: File
+            volumeMounts:
+            - mountPath: /secret/gcp-credentials
+              name: gcp-credentials
+              readOnly: true
+            - mountPath: /src
+              name: src
           containers:
-          - name: deploy-worker
-            image: gcr.io/kubeflow-ci/deploy-worker:v20190819-8723ec6-e3b0c4
+          - name: deploy
+            image: gcr.io/kubeflow-ci/test-worker@sha256:dd559f89b3cbd926ec563559995f25025eecc6290b3146f17f82d2f084d07ee2
             env:
             - name: GOOGLE_APPLICATION_CREDENTIALS
               value: /secret/gcp-credentials/key.json
+            - name: PYTHONPATH
+              value: /src/kubeflow/testing/py
             command:
-            - /usr/local/bin/auto_deploy.sh
-            - --repos=kubeflow/kubeflow;kubeflow/testing
+            - python
+            - -m
+            - kubeflow.testing.create_unique_kf_instance
+            - --apps_dir=/src/apps
+            - --kubeflow_repo=/src/kubeflow/kubeflow
+            - --name=kf-vmaster-{uid}
             - --project=kubeflow-ci-deployment
-            - --job_labels=/etc/pod-info/labels
-            - --data_dir=/mnt/test-data-volume/auto_deploy
-            - --base_name=kf-vmaster
-            - --max_num_cluster=5
-            - --zone=us-east1-b
-            - --github_token_file=/secret/github-token/github_token
+            - --zone=us-central1-a
             - --kfctl_config=https://raw.githubusercontent.com/kubeflow/manifests/master/kfdef/kfctl_gcp_iap.yaml
             volumeMounts:
             - name: gcp-credentials
               mountPath: /secret/gcp-credentials
               readOnly: true
-            - name: oauth-secret
-              mountPath: /secret/oauth-secret
-              readOnly: true
-            - name: pod-info
-              mountPath: /etc/pod-info
-              readOnly: true
-            - name: github-token
-              mountPath: /secret/github-token
-              readOnly: true
-            - name: test-data-volume
-              mountPath: /mnt/test-data-volume
-              readOnly: false
+            - mountPath: /src
+              name: src
           restartPolicy: Never
           volumes:
           - name: gcp-credentials
             secret:
               secretName: gcp-credentials
-          - name: oauth-secret
-            secret:
-              secretName: kubeflow-ci-deployment-iap-testing-oauth
-          - name: github-token
-            secret:
-              secretName: github-token
-          - name: test-data-volume
-            persistentVolumeClaim:
-              claimName: nfs-external
-          - name: pod-info
-            downwardAPI:
-              items:
-              - path: labels
-                fieldRef:
-                  fieldPath: metadata.labels
+          - name: src
+            emptyDir: {}
\ No newline at end of file
diff --git a/test-infra/auto-deploy/deploy-master.yaml b/test-infra/auto-deploy/deploy-master.yaml
index ea14cf1c1..99e30c1c0 100644
--- a/test-infra/auto-deploy/deploy-master.yaml
+++ b/test-infra/auto-deploy/deploy-master.yaml
@@ -1,63 +1,73 @@
+# This version of the script has been updated to use unique names
+# See https://github.com/kubeflow/testing/issues/444
+# It is also using an init container to check out the code.
 apiVersion: batch/v1
 kind: Job
 metadata:
   generateName: deploy-master-
   namespace: kubeflow-test-infra
   labels:
-    app: deploy-master
+    job: deploy-master-oneoff
     version: master
 spec:
   backoffLimit: 1
   template:
     metadata:
       labels:
-        job: deploy-master
+        job: deploy-master-oneoff
         version: master
     spec:
+      initContainers:
+      - command:
+        - /usr/local/bin/checkout_repos.sh
+        # Stop using PR #495 once it is submitted
+        - --repos=kubeflow/kubeflow@HEAD,kubeflow/testing@HEAD:495
+        - --src_dir=/src
+        env:
+        - name: PYTHONPATH
+          value: /src/kubeflow/testing/py
+        - name: GOOGLE_APPLICATION_CREDENTIALS
+          value: /secret/gcp-credentials/key.json
+        image: gcr.io/kubeflow-ci/test-worker@sha256:dd559f89b3cbd926ec563559995f25025eecc6290b3146f17f82d2f084d07ee2
+        imagePullPolicy: IfNotPresent
+        name: checkout
+        resources: {}
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+        volumeMounts:
+        - mountPath: /secret/gcp-credentials
+          name: gcp-credentials
+          readOnly: true
+        - mountPath: /src
+          name: src
       containers:
-      - name: deploy-worker
-        image: gcr.io/kubeflow-ci/deploy-worker:v20190819-8723ec6-e3b0c4
+      - name: deploy
+        image: gcr.io/kubeflow-ci/test-worker@sha256:dd559f89b3cbd926ec563559995f25025eecc6290b3146f17f82d2f084d07ee2
         env:
         - name: GOOGLE_APPLICATION_CREDENTIALS
           value: /secret/gcp-credentials/key.json
+        - name: PYTHONPATH
+          value: /src/kubeflow/testing/py
         command:
-        - /usr/local/bin/auto_deploy.sh
-        - --repos=kubeflow/kubeflow;kubeflow/testing
+        - python
+        - -m
+        - kubeflow.testing.create_unique_kf_instance
+        - --apps_dir=/src/apps
+        - --kubeflow_repo=/src/kubeflow/kubeflow
+        - --name=kf-vmaster-{uid}
         - --project=kubeflow-ci-deployment
-        - --job_labels=/etc/pod-info/labels
-        - --data_dir=/mnt/test-data-volume/auto_deploy
-        - --base_name=kf-vmaster
-        - --max_num_cluster=5
-        - --zone=us-east1-b
-        - --github_token_file=/secret/github-token/github_token
+        - --zone=us-central1-a
         - --kfctl_config=https://raw.githubusercontent.com/kubeflow/manifests/master/kfdef/kfctl_gcp_iap.yaml
         volumeMounts:
         - name: gcp-credentials
           mountPath: /secret/gcp-credentials
           readOnly: true
-        - name: pod-info
-          mountPath: /etc/pod-info
-          readOnly: true
-        - name: github-token
-          mountPath: /secret/github-token
-          readOnly: true
-        - name: test-data-volume
-          mountPath: /mnt/test-data-volume
-          readOnly: false
+        - mountPath: /src
+          name: src
       restartPolicy: Never
       volumes:
       - name: gcp-credentials
         secret:
           secretName: gcp-credentials
-      - name: github-token
-        secret:
-          secretName: github-token
-      - name: test-data-volume
-        persistentVolumeClaim:
-          claimName: nfs-external
-      - name: pod-info
-        downwardAPI:
-          items:
-          - path: labels
-            fieldRef:
-              fieldPath: metadata.labels
+      - name: src
+        emptyDir: {}