Skip to content

Commit

Permalink
Auto deploy job needs to use the new kfctl syntax; also use unique names
Browse files Browse the repository at this point in the history
Related to kubeflow#471

* Don't set name in the spec because we want to infer it from the directory.

* Create a new script to deploy with a unique name

* Related to: kubeflow#444

* Update cleanup script to clean up new auto-deployed clusters
  • Loading branch information
Jeremy Lewi committed Oct 18, 2019
1 parent f8302c8 commit bc78726
Show file tree
Hide file tree
Showing 5 changed files with 307 additions and 81 deletions.
13 changes: 10 additions & 3 deletions py/kubeflow/testing/cleanup_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,26 @@
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

# See https://github.com/kubeflow/testing/issues/444
# We are switching to unique names for auto deployments
# So this matches the new ones.
AUTO_DEPLOY_PATTERN = re.compile("kf-vmaster-(?!n\d\d)")

# Regexes that select matching deployments
MATCHING = [re.compile("e2e-.*"), re.compile("kfctl.*"),
re.compile("z-.*"), re.compile(".*presubmit.*")]
re.compile("z-.*"), re.compile(".*presubmit.*"),
AUTO_DEPLOY_PATTERN]

MATCHING_FIREWALL_RULES = [re.compile("gke-kfctl-.*"),
re.compile("gke-e2e-.*"),
re.compile(".*presubmit.*"),
re.compile(".*postsubmit.*")]
re.compile(".*postsubmit.*"),
AUTO_DEPLOY_PATTERN]

# Regexes that select matching disks
MATCHING_DISK = [re.compile(".*jlewi.*"), re.compile(".*kfctl.*"),
re.compile(".*postsubmit.*"), re.compile(".*presubmit.*"),
]
AUTO_DEPLOY_PATTERN]

def is_match_disk(name):
for m in MATCHING_DISK:
Expand Down
23 changes: 15 additions & 8 deletions py/kubeflow/testing/create_kf_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
The purpose of this script is to automate the creation of Kubeflow Deployments
corresponding to different versions of Kubeflow.
TODO: This script is obsolete; we should get rid of it in favor of
create_unique_kf_instance.py.
"""
import argparse
import logging
Expand Down Expand Up @@ -106,18 +109,22 @@ def deploy_with_kfctl_go(kfctl_path, args, app_dir, env):

config_spec["spec"] = util.filter_spartakus(config_spec["spec"])

# Remove name because we will auto infer from directory.
if "name" in config_file["metadata"]:
logging.info("Deleting name in kfdef spec.")
del config_spec["metadata"]["name"]

logging.info("KFDefSpec:\n%s", str(config_spec))
with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
config_file = f.name
logging.info("Writing file %s", f.name)
yaml.dump(config_spec, f)

util.run([kfctl_path, "init", app_dir, "-V", "--config=" + config_file],
env=env)
if not os.path.exists(app_dir):
logging.info("Creating app dir %s", app_dir)

util.run([kfctl_path, "generate", "-V", "all"], env=env, cwd=app_dir)
config_file = os.path.join(app_dir, "kf_config.yaml")
with open(config_file, "w") as hf:
logging.info("Writing file %s", config_file)
yaml.dump(config_spec, hf)

util.run([kfctl_path, "apply", "-V", "all"], env=env, cwd=app_dir)
util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)

def main(): # pylint: disable=too-many-locals,too-many-statements
logging.basicConfig(level=logging.INFO,
Expand Down
201 changes: 201 additions & 0 deletions py/kubeflow/testing/create_unique_kf_instance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"""Create a Kubeflow instance.
The purpose of this script is to automate the creation of Kubeflow Deployments
corresponding to different versions of Kubeflow.
This script should replace create_kf_instance. Unlike create_kf_instance
we no longer need to recycle kf app names because of IAP so we can
use unique names which greatly simplifies things.
In particular, we don't need to do any cleanup in this script because we
rely on cleanup_ci to garbage collect old auto deployments.
"""
import argparse
import datetime
import logging
import json
import os
import re
import requests
import shutil
import subprocess
import tempfile
import uuid
import yaml

from googleapiclient import discovery
from google.cloud import storage
from kubeflow.testing import util
from retrying import retry
from oauth2client.client import GoogleCredentials

@retry(wait_fixed=60000, stop_max_attempt_number=5)
def run_with_retry(*args, **kwargs):
  """Run a command via util.run, retrying on failure.

  Retries up to 5 times with a fixed 60s wait between attempts. Intended
  for commands that fail intermittently (e.g. network-dependent builds).
  All arguments are forwarded unchanged to util.run.
  """
  util.run(*args, **kwargs)

def build_kfctl_go(args):
  """Build the kfctl go binary.

  Args:
    args: Parsed command line arguments; args.kubeflow_repo is the path
      to the Kubeflow repo checkout to build kfctl from.

  Returns:
    Path to the built kfctl binary.
  """
  build_dir = os.path.join(args.kubeflow_repo, "bootstrap")
  # We need to use retry builds because when building in the test cluster
  # we see intermittent failures pulling dependencies.
  # (Bug fix: the original called util.run directly, so the retry described
  # above never happened; use the run_with_retry helper instead.)
  run_with_retry(["make", "build-kfctl"], cwd=build_dir)
  kfctl_path = os.path.join(build_dir, "bin", "kfctl")

  return kfctl_path

def deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=None):
  """Deploy Kubeflow using the kfctl go binary.

  Args:
    kfctl_path: Path to the kfctl binary.
    args: Parsed command line arguments; uses args.kfctl_config,
      args.project and args.zone.
    app_dir: Directory to write the app config into. kfctl uses the
      directory of the config file as the app dir, so it also determines
      the deployment name.
    env: Dict of environment variables for the kfctl subprocess. The IAP
      username and password are passed as env vars and won't appear in
      the logs.
    labels: Optional dict of labels merged into the KFDef metadata labels.
  """
  # We need to edit and rewrite the config file to the app dir because
  # kfctl uses the path of the config file as the app dir.
  logging.warning("Loading configs %s.", args.kfctl_config)

  if args.kfctl_config.startswith("http"):
    response = requests.get(args.kfctl_config)
    raw_config = response.content
  else:
    with open(args.kfctl_config) as hf:
      raw_config = hf.read()

  # Bug fix: yaml.load without an explicit Loader is unsafe and deprecated
  # in PyYAML >= 5.1; the KFDef config is plain data so safe_load suffices.
  config_spec = yaml.safe_load(raw_config)

  # We need to specify a valid email because
  # 1. We need to create appropriate RBAC rules to allow the current user
  #    to create the required K8s resources.
  # 2. Setting the IAM policy will fail if the email is invalid.
  email = util.run(["gcloud", "config", "get-value", "account"])

  if not email:
    raise ValueError("Could not determine GCP account being used.")

  config_spec["spec"]["project"] = args.project
  config_spec["spec"]["email"] = email
  config_spec["spec"]["zone"] = args.zone

  config_spec["spec"] = util.filter_spartakus(config_spec["spec"])

  # Remove name because we will auto infer it from the directory.
  if "name" in config_spec["metadata"]:
    logging.info("Deleting name in kfdef spec.")
    del config_spec["metadata"]["name"]

  if "labels" not in config_spec["metadata"]:
    config_spec["metadata"]["labels"] = {}

  if labels:
    config_spec["metadata"]["labels"].update(labels)

  logging.info("KFDefSpec:\n%s", yaml.safe_dump(config_spec))

  if not os.path.exists(app_dir):
    logging.info("Creating app dir %s", app_dir)
    os.makedirs(app_dir)

  config_file = os.path.join(app_dir, "kf_config.yaml")
  with open(config_file, "w") as hf:
    logging.info("Writing file %s", config_file)
    yaml.dump(config_spec, hf)

  util.run([kfctl_path, "apply", "-V", "-f", config_file], env=env)

def main(): # pylint: disable=too-many-locals,too-many-statements
  """Entry point: build kfctl and create a uniquely named KF deployment."""
  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()

  parser.add_argument(
    "--project", default="kubeflow-ci-deployment", type=str,
    help=("The project."))

  parser.add_argument(
    "--zone", default="us-east1-d", type=str, help=("The zone to deploy in."))

  parser.add_argument(
    "--oauth_file",
    default=("gs://kubeflow-ci-deployment_kf-data/"
             "kf-iap-oauth.kubeflow-ci-deployment.yaml"),
    type=str, help=("The file containing the OAuth client ID & secret"
                    "for IAP."))

  parser.add_argument(
    "--kubeflow_repo",
    default="/src/kubeflow/kubeflow",
    type=str, help=("Path to the Kubeflow repo to use"))

  parser.add_argument(
    "--kfctl_config",
    default=("https://raw.githubusercontent.com/kubeflow/manifests"
             "/master/kfdef/kfctl_gcp_iap.yaml"),
    type=str, help=("Path to the kfctl config to use"))

  parser.add_argument(
    "--apps_dir",
    default=os.getcwd(),
    type=str, help=("Directory to store kubeflow apps."))

  parser.add_argument(
    "--name", type=str, default="kf-vmaster-{uid}",
    help=("Name for the deployment. This can be a python format string "
          "with the variable uid. Uid will automatically be substituted "
          "for a unique value based on the time."))

  parser.add_argument(
    "--job_name",
    default="", type=str, help=("Pod name running the job."))

  args = parser.parse_args()

  util.maybe_activate_service_account()

  # Fetch the IAP OAuth client id & secret from GCS.
  bucket, blob_path = util.split_gcs_uri(args.oauth_file)

  client = storage.Client(project=args.project)
  bucket = client.get_bucket(bucket)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()

  # Bug fix: yaml.load without an explicit Loader is unsafe and deprecated
  # in PyYAML >= 5.1; the OAuth file is plain data so safe_load suffices.
  oauth_info = yaml.safe_load(contents)

  git_describe = util.run(["git", "describe", "--tags", "--always", "--dirty"],
                          cwd=args.kubeflow_repo).strip("'")

  kfctl_path = build_kfctl_go(args)

  # We need to keep the name short to avoid hitting limits with certificates.
  uid = datetime.datetime.now().strftime("%m%d") + "-"
  uid = uid + uuid.uuid4().hex[0:3]

  args.name = args.name.format(uid=uid)
  logging.info("Using name %s", args.name)

  app_dir = os.path.join(args.apps_dir, args.name)

  if not os.path.exists(args.apps_dir):
    os.makedirs(args.apps_dir)

  # The OAuth secrets are passed via the environment so they won't appear
  # in logs.
  env = {}
  env.update(os.environ)
  env.update(oauth_info)

  raw_labels = {"GIT_LABEL": git_describe,
                "PURPOSE": "kf-test-cluster",
               }

  # Labels can only take as input alphanumeric characters, hyphens, and
  # underscores. Replace not valid characters with hyphens.
  # Bug fix: the original code computed the sanitized label strings into an
  # unused list (label_args) and then passed the RAW labels to
  # deploy_with_kfctl_go; now the sanitized labels are the ones deployed.
  labels = {}
  for k, v in raw_labels.items():
    val = v.lower().replace("\"", "")
    val = re.sub(r"[^a-z0-9\-_]", "-", val)
    labels[k.lower()] = val

  deploy_with_kfctl_go(kfctl_path, args, app_dir, env, labels=labels)

# Script entry point.
if __name__ == "__main__":
  main()
75 changes: 38 additions & 37 deletions test-infra/auto-deploy/deploy-cron-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,56 +12,57 @@ spec:
backoffLimit: 2
template:
spec:
initContainers:
- command:
- /usr/local/bin/checkout_repos.sh
# Stop using PR #495 once it's submitted
- --repos=kubeflow/kubeflow@HEAD,kubeflow/testing@HEAD:495
- --src_dir=/src
env:
- name: PYTHONPATH
value: /src/kubeflow/testing/py
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /secret/gcp-credentials/key.json
image: gcr.io/kubeflow-ci/test-worker@sha256:dd559f89b3cbd926ec563559995f25025eecc6290b3146f17f82d2f084d07ee2
imagePullPolicy: IfNotPresent
name: checkout
resources: {}
terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File
volumeMounts:
- mountPath: /secret/gcp-credentials
name: gcp-credentials
readOnly: true
- mountPath: /src
name: src
containers:
- name: deploy-worker
image: gcr.io/kubeflow-ci/deploy-worker:v20190819-8723ec6-e3b0c4
- name: deploy
image: gcr.io/kubeflow-ci/test-worker@sha256:dd559f89b3cbd926ec563559995f25025eecc6290b3146f17f82d2f084d07ee2
env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /secret/gcp-credentials/key.json
- name: PYTHONPATH
value: /src/kubeflow/testing/py
command:
- /usr/local/bin/auto_deploy.sh
- --repos=kubeflow/kubeflow;kubeflow/testing
- python
- -m
- kubeflow.testing.create_unique_kf_instance
- --apps_dir=/src/apps
- --kubeflow_repo=/src/kubeflow/kubeflow
- --name=kf-vmaster-{uid}
- --project=kubeflow-ci-deployment
- --job_labels=/etc/pod-info/labels
- --data_dir=/mnt/test-data-volume/auto_deploy
- --base_name=kf-vmaster
- --max_num_cluster=5
- --zone=us-east1-b
- --github_token_file=/secret/github-token/github_token
- --zone=us-central1-a
- --kfctl_config=https://raw.githubusercontent.com/kubeflow/manifests/master/kfdef/kfctl_gcp_iap.yaml
volumeMounts:
- name: gcp-credentials
mountPath: /secret/gcp-credentials
readOnly: true
- name: oauth-secret
mountPath: /secret/oauth-secret
readOnly: true
- name: pod-info
mountPath: /etc/pod-info
readOnly: true
- name: github-token
mountPath: /secret/github-token
readOnly: true
- name: test-data-volume
mountPath: /mnt/test-data-volume
readOnly: false
- mountPath: /src
name: src
restartPolicy: Never
volumes:
- name: gcp-credentials
secret:
secretName: gcp-credentials
- name: oauth-secret
secret:
secretName: kubeflow-ci-deployment-iap-testing-oauth
- name: github-token
secret:
secretName: github-token
- name: test-data-volume
persistentVolumeClaim:
claimName: nfs-external
- name: pod-info
downwardAPI:
items:
- path: labels
fieldRef:
fieldPath: metadata.labels
- name: src
emptyDir: {}
Loading

0 comments on commit bc78726

Please sign in to comment.