Script to garbage collect leaked resources (kubeflow#189)
* Create a script to cleanup leaked resources in kubeflow-ci.

* Add a README.md

* Fix lint.

* Fix lint.

* Update regex to match chainer tests.
jlewi authored and k8s-ci-robot committed Aug 14, 2018
1 parent e5eb5e4 commit 9e5956e
Showing 3 changed files with 217 additions and 0 deletions.
14 changes: 14 additions & 0 deletions README.md
@@ -245,6 +245,20 @@ To monitor the job, open Prow's UI by navigating to the external IP
associated with the ingress for your Prow cluster or using
kubectl proxy.
## Cleaning up leaked resources
Test failures sometimes leave resources (GCP deployments, VMs, GKE clusters) still running.
The following scripts can be used to garbage collect leaked resources.
```
py/kubeflow/testing/cleanup_ci.py --delete_script=${DELETE_SCRIPT}
```
* **DELETE_SCRIPT** should be the path to a copy of [delete_deployment.sh](https://github.com/kubeflow/kubeflow/blob/master/scripts/gke/delete_deployment.sh).

There's a second script, [cleanup_kubeflow_ci](https://github.com/kubeflow/kubeflow/blob/master/scripts/cleanup_kubeflow_ci.sh),
in the kubeflow repository to clean up resources left by ingresses.
## Integration with K8s Prow Infrastructure.
We rely on the K8s instance of Prow to actually run our jobs.
@@ -0,0 +1,39 @@
apiVersion: 0.1.0
gitVersion:
  commitSha: 3b5c7861bdfb8a4f73d583f08d7ca8c9066d9190
  refSpec: v0.2-branch
kind: ksonnet.io/registry
libraries:
  argo:
    path: argo
    version: master
  automation:
    path: automation
    version: master
  core:
    path: core
    version: master
  katib:
    path: katib
    version: master
  mpi-job:
    path: mpi-job
    version: master
  new-package-stub:
    path: new-package-stub
    version: master
  openmpi:
    path: openmpi
    version: master
  pachyderm:
    path: pachyderm
    version: master
  pytorch-job:
    path: pytorch-job
    version: master
  seldon:
    path: seldon
    version: master
  tf-serving:
    path: tf-serving
    version: master
164 changes: 164 additions & 0 deletions py/kubeflow/testing/cleanup_ci.py
@@ -0,0 +1,164 @@
"""Cleanup Kubeflow deployments in our ci system."""
import argparse
import datetime
import logging
import os
import re
import subprocess
import tempfile

from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

# Regexes that select matching deployments
MATCHING = [re.compile("e2e-.*"), re.compile("kfctl.*"),
re.compile("z-.*"), re.compile(".*presubmit.*")]

def is_match(name):
  for m in MATCHING:
    if m.match(name):
      return True

  return False
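# A quick illustration of the patterns above (names are hypothetical):
#   is_match("e2e-1234-abcd")      -> True   (matches "e2e-.*")
#   is_match("kfctl-presubmit-42") -> True   (matches "kfctl.*")
#   is_match("kubeflow-prod")      -> False  (left untouched)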

def main(): # pylint: disable=too-many-locals,too-many-statements
  parser = argparse.ArgumentParser()
  parser.add_argument(
    "--project", default="kubeflow-ci", type=str, help=("The project."))

  parser.add_argument(
    "--max_age_hours", default=3, type=int,
    help=("The age of deployments to gc."))

  parser.add_argument(
    "--update_first", default=False, type=bool,
    help="Whether to update the deployment first.")

  parser.add_argument(
    "--delete_script", default="", type=str,
    help=("The path to the delete_deployment.sh script which is in the "
          "Kubeflow repository."))

  parser.add_argument(
    "--zones", default="us-east1-d,us-central1-a", type=str,
    help="Comma separated list of zones to check.")

  args = parser.parse_args()

  if not args.delete_script:
    raise ValueError("--delete_script must be specified.")

  logging.basicConfig(level=logging.INFO,
                      format=('%(levelname)s|%(asctime)s'
                              '|%(pathname)s|%(lineno)d| %(message)s'),
                      datefmt='%Y-%m-%dT%H:%M:%S',
                      )
  logging.getLogger().setLevel(logging.INFO)

  credentials = GoogleCredentials.get_application_default()
  dm = discovery.build("deploymentmanager", "v2", credentials=credentials)

  manifests_client = dm.manifests()
  deployments_client = dm.deployments()
  deployments = deployments_client.list(project=args.project).execute()

  for d in deployments.get("deployments", []):
    if not d.get("insertTime", None):
      logging.warning("Deployment %s doesn't have an insert time; "
                      "skipping it", d["name"])
      continue

    name = d["name"]

    if not is_match(name):
      logging.info("Skipping Deployment %s; it does not match expected names.",
                   name)
      continue

    full_insert_time = d.get("insertTime")
    # The docs say insert time will be in RFC 3339 format.
    # But it looks like it also includes a time zone offset in the form
    # -HH:MM, which is slightly different from what datetime strftime %z
    # uses (no colon).
    # https://cloud.google.com/deployment-manager/docs/reference/latest/deployments/insert
    #
    # So we parse out the hours ourselves.
    insert_time_str = full_insert_time[:-6]
    tz_offset = full_insert_time[-6:]
    hours_offset = int(tz_offset.split(":", 1)[0])
    RFC3339 = "%Y-%m-%dT%H:%M:%S.%f"
    insert_time = datetime.datetime.strptime(insert_time_str, RFC3339)

    # Convert the time to UTC
    insert_time_utc = insert_time + datetime.timedelta(hours=-1 * hours_offset)
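    # Worked example (timestamp is hypothetical): for
    #   full_insert_time = "2018-08-14T10:30:00.000-07:00"
    # insert_time_str is "2018-08-14T10:30:00.000", tz_offset is "-07:00",
    # and hours_offset is -7, so we add 7 hours to get
    # 2018-08-14T17:30:00 UTC. The offset's minutes are dropped, which is
    # fine for the whole-hour zones used here.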

    age = datetime.datetime.utcnow() - insert_time_utc

    if age > datetime.timedelta(hours=args.max_age_hours):
      command = [args.delete_script, args.project, name]
      cwd = None
      # If we download the manifests first, delete_deployment will issue
      # an update before the delete, which can help do a clean delete.
      # But that has the disadvantage of recreating GKE clusters that have
      # already been deleted, which is slow and wasteful.
      if args.update_first:
        # Download the manifest for this deployment.
        # We want to do an update and then a delete because this is necessary
        # for deleting role bindings.
        manifest_url = d["manifest"]
        manifest_name = manifest_url.split("/")[-1]
        manifest = manifests_client.get(
          project=args.project, deployment=name, manifest=manifest_name).execute()

        # Create a temporary directory to store the deployment.
        manifest_dir = tempfile.mkdtemp(prefix="tmp" + name)
        logging.info("Creating directory %s to store manifests for deployment %s",
                     manifest_dir, name)
        with open(os.path.join(manifest_dir, "cluster-kubeflow.yaml"), "w") as hf:
          hf.write(manifest["config"]["content"])

        for i in manifest["imports"]:
          with open(os.path.join(manifest_dir, i["name"]), "w") as hf:
            hf.write(i["content"])

        command.append("cluster-kubeflow.yaml")
        cwd = manifest_dir
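        # At this point the command is, e.g. (deployment name hypothetical):
        #   delete_deployment.sh kubeflow-ci e2e-1234 cluster-kubeflow.yaml
        # run with cwd set to manifest_dir so the script can find the
        # downloaded manifest.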
logging.info("Deleting deployment %s; inserted at %s", name,
full_insert_time)

# We could potentially run the deletes in parallel but that would lead
# to very confusing logs.
subprocess.check_call(command, cwd=cwd)

gke = discovery.build("container", "v1", credentials=credentials)

clusters_client = gke.projects().zones().clusters()

for zone in args.zones.split(","):
clusters = clusters_client.list(projectId=args.project, zone=zone).execute()

for c in clusters["clusters"]:
name = c["name"]
if not is_match(name):
logging.info("Skipping cluster%s; it does not match expected names.",
name)
continue

full_insert_time = c["createTime"]
insert_time_str = full_insert_time[:-6]
tz_offset = full_insert_time[-6:]
hours_offset = int(tz_offset.split(":", 1)[0])
RFC3339 = "%Y-%m-%dT%H:%M:%S"
insert_time = datetime.datetime.strptime(insert_time_str, RFC3339)

# Convert the time to UTC
insert_time_utc = insert_time + datetime.timedelta(hours=-1 * hours_offset)
age = datetime.datetime.utcnow()- insert_time_utc

if age > datetime.timedelta(hours=args.max_age_hours):
logging.info("Deleting cluster %s in zone %s", name, zone)

clusters_client.delete(projectId=args.project, zone=zone,
clusterId=name).execute()

if __name__ == "__main__":
main()
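For a quick read-only view of what this script would consider, here is a minimal sketch. It assumes application-default credentials are configured and uses the script's default project name; it mirrors the listing call above without deleting anything:

```
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials

credentials = GoogleCredentials.get_application_default()
dm = discovery.build("deploymentmanager", "v2", credentials=credentials)

# Print the two fields cleanup_ci.py keys off: the deployment name (checked
# against MATCHING) and insertTime (checked against --max_age_hours).
deployments = dm.deployments().list(project="kubeflow-ci").execute()
for d in deployments.get("deployments", []):
  print(d["name"], d.get("insertTime", "<no insert time>"))
```

Like cleanup_ci.py itself, this ignores pagination (nextPageToken), which is reasonable for the small number of deployments a CI project accumulates.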
