From 156497b253970e8c8620504462ababb78f0b8a9e Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 30 Oct 2019 18:22:29 -0700 Subject: [PATCH] cleanup_ci needs to use different expiration times for auto deployed and E2E clusters * The auto deployed clusters are now using unique names rather than being recycled and we rely on cleanup_ci.py to GC old auto-deployments (#444) * To support this we need to use variable expiration times. * Deployments created by tests should expire as soon as the test is done (so 1-2 hours) * But auto-deployed clusters need to live longer * They are only refreshed periodically by a cron job (~8 hours) we don't want to delete the cluster before a new one is deployed because we need a cluster for the example tests * We want to leave the clusters up as long as we can to facilitate debugging by people working on example tests. Related to #444 --- py/kubeflow/testing/cleanup_ci.py | 204 +++++++++++------- py/kubeflow/testing/cleanup_ci_test.py | 23 ++ .../cleanup-kubeflow-ci-deployment.yaml | 6 +- 3 files changed, 151 insertions(+), 82 deletions(-) create mode 100644 py/kubeflow/testing/cleanup_ci_test.py diff --git a/py/kubeflow/testing/cleanup_ci.py b/py/kubeflow/testing/cleanup_ci.py index 25cf08c04..8ffd519dc 100644 --- a/py/kubeflow/testing/cleanup_ci.py +++ b/py/kubeflow/testing/cleanup_ci.py @@ -20,34 +20,47 @@ # See https://github.com/kubeflow/testing/issues/444 # We are switching to unique names for auto deployments # So this matches the new ones. 
-AUTO_DEPLOY_PATTERN = re.compile(r"kf-vmaster-(?!n\d\d)") - -# Regexes that select matching deployments -MATCHING = [re.compile("e2e-.*"), re.compile("kfctl.*"), - re.compile("z-.*"), re.compile(".*presubmit.*"), - AUTO_DEPLOY_PATTERN] - -MATCHING_FIREWALL_RULES = [re.compile("gke-kfctl-.*"), - re.compile("gke-e2e-.*"), - re.compile(".*presubmit.*"), - re.compile(".*postsubmit.*"), - AUTO_DEPLOY_PATTERN] - -# Regexes that select matching disks -MATCHING_DISK = [re.compile(".*jlewi.*"), re.compile(".*kfctl.*"), - re.compile(".*postsubmit.*"), re.compile(".*presubmit.*"), - AUTO_DEPLOY_PATTERN] - -def is_match_disk(name): - for m in MATCHING_DISK: - if m.match(name): - return True - - return False +AUTO_DEPLOY_PATTERNS = [re.compile(r"kf-vmaster-(?!n\d\d)")] + +E2E_PATTERNS = [re.compile(".*e2e-.*"), re.compile(".*kfctl.*"), + re.compile(".*z-.*"), re.compile(".*presubmit.*")] + +# Constants enumerating the different classes of infra +# We currently have 2 types +# Deployments created by E2E tests +# Auto deployments from master and release branches +AUTO_INFRA = "auto-deploy" +E2E_INFRA = "e2e" + +# We use E2E OWNERLESS for resources that we currently don't know how +# to map back to a particular kubeflow deployment. +# In this case we can't determine whether it's an E2E deployment or auto-deployed +# resource. So we just associate a long max age with this. +E2E_OWNERLESS = "e2e-ownerless" + +# Map the different classes of infra to max lifetimes. 
+MAX_LIFETIME = { + # Auto infra should live for 8-24 hours to facilitate debugging of examples + AUTO_INFRA: datetime.timedelta(days=2), + # E2E infra should be relatively short lived + E2E_INFRA: datetime.timedelta(hours=3), + # This should be larger than Auto_infra + E2E_OWNERLESS: datetime.timedelta(days=2), +} + +def name_to_infra_type(name): + """Given a name map it to the type of infrastructure.""" + + if is_match(name, patterns=AUTO_DEPLOY_PATTERNS): + return AUTO_INFRA + + if is_match(name, patterns=E2E_PATTERNS): + return E2E_INFRA + return None def is_match(name, patterns=None): if not patterns: - patterns = MATCHING + patterns = E2E_PATTERNS for m in patterns: if m.match(name): return True @@ -127,12 +140,19 @@ def cleanup_endpoints(args): for s in results["services"]: name = s["serviceName"] - if not is_match(name): + infra_type = name_to_infra_type(name) + + if not infra_type: + logging.info("Skipping endpoint %s; it does not match any infra type.", + name) unmatched.append(name) continue + logging.info("Endpoint %s categorized as %s", name, infra_type) + all_rollouts = rollouts.list(serviceName=name).execute() is_expired = False + max_age = MAX_LIFETIME[infra_type] if not all_rollouts.get("rollouts", []): logging.info("Service %s has no rollouts", name) is_expired = True @@ -143,7 +163,7 @@ def cleanup_endpoints(args): now = datetime.datetime.now(create_time.tzinfo) age = now - create_time - if age > datetime.timedelta(hours=args.max_age_hours): + if age > max_age: is_expired = True if is_expired: @@ -187,12 +207,20 @@ def cleanup_disks(args): break for d in results["items"]: name = d["name"] - if not is_match_disk(name): + + infra_type = name_to_infra_type(name) + + if not infra_type: + logging.info("Skipping disk %s; it does not match any infra type.", + name) unmatched.append(name) continue + logging.info("Disk %s categorized as %s", name, infra_type) + + max_age = MAX_LIFETIME[infra_type] age = getAge(d["creationTimestamp"]) - if age > 
datetime.timedelta(hours=args.max_age_hours): + if age > max_age: logging.info("Deleting disk: %s, age = %r", name, age) if not args.dryrun: response = disks.delete(project=args.project, zone=zone, @@ -231,21 +259,22 @@ def cleanup_firewall_rules(args): for d in results["items"]: name = d["name"] - match = False - if is_match(name, patterns=MATCHING_FIREWALL_RULES): - match = True + infra_type = name_to_infra_type(name) for tag in d.get("targetTags", []): - if is_match(tag, patterns=MATCHING_FIREWALL_RULES): - match = True - break + tag_infra_type = name_to_infra_type(tag) + if tag_infra_type: + infra_type = tag_infra_type - if not match: + if not infra_type: unmatched.append(name) continue + logging.info("Firewall rule %s classified as infra type %s", name, + infra_type) + max_age = MAX_LIFETIME[infra_type] age = getAge(d["creationTimestamp"]) - if age > datetime.timedelta(hours=args.max_age_hours): + if age > max_age: logging.info("Deleting firewall: %s, age = %r", name, age) if not args.dryrun: response = firewalls.delete(project=args.project, @@ -274,6 +303,9 @@ def cleanup_instance_groups(args): unexpired = [] in_use = [] + # TODO(jlewi): We should check whether the instance group is in use + # before deleting it. At least in pantheon it looks like instance groups + # are listed as in use by clusters. 
+ for zone in args.zones.split(","): # pylint: disable=too-many-nested-blocks while True: results = instanceGroups.list(project=args.project, @@ -284,8 +316,17 @@ for s in results["items"]: name = s["name"] age = getAge(s["creationTimestamp"]) - if age > datetime.timedelta( - hours=args.max_ci_deployment_resource_age_hours): + + infra_type = name_to_infra_type(name) + + if not infra_type: + logging.info("Skipping instance group %s; it does not match any " + "infra type.", name) + continue + + logging.info("Instance group %s categorized as %s", name, infra_type) + + if age > MAX_LIFETIME[infra_type]: logging.info("Deleting instanceGroups: %s, age = %r", name, age) if not args.dryrun: try: @@ -328,8 +369,7 @@ def cleanup_url_maps(args): for s in results["items"]: name = s["name"] age = getAge(s["creationTimestamp"]) - if age > datetime.timedelta( - hours=args.max_ci_deployment_resource_age_hours): + if age > MAX_LIFETIME[E2E_OWNERLESS]: logging.info("Deleting urlMaps: %s, age = %r", name, age) if not args.dryrun: try: @@ -372,7 +412,7 @@ def cleanup_target_https_proxies(args): for s in results["items"]: name = s["name"] age = getAge(s["creationTimestamp"]) - if age > datetime.timedelta(hours=args.max_proxy_age_hours): + if age > MAX_LIFETIME[E2E_OWNERLESS]: logging.info("Deleting urlMaps: %s, age = %r", name, age) if not args.dryrun: try: @@ -417,8 +457,7 @@ def cleanup_target_http_proxies(args): for s in results["items"]: name = s["name"] age = getAge(s["creationTimestamp"]) - if age > datetime.timedelta( - hours=args.max_ci_deployment_resource_age_hours): + if age > MAX_LIFETIME[E2E_OWNERLESS]: logging.info("Deleting urlMaps: %s, age = %r", name, age) if not args.dryrun: try: @@ -462,7 +501,7 @@ def cleanup_forwarding_rules(args): for s in results["items"]: name = s["name"] age = getAge(s["creationTimestamp"]) - if age > datetime.timedelta(hours=args.max_proxy_age_hours): + if age > MAX_LIFETIME[E2E_OWNERLESS]: logging.info("Deleting 
forwarding rule: %s, age = %r", name, age) if not args.dryrun: try: @@ -506,8 +545,7 @@ def cleanup_backend_services(args): for s in results["items"]: name = s["name"] age = getAge(s["creationTimestamp"]) - if age > datetime.timedelta( - hours=args.max_ci_deployment_resource_age_hours): + if age > MAX_LIFETIME[E2E_OWNERLESS]: logging.info("Deleting backend services: %s, age = %r", name, age) if not args.dryrun: try: @@ -623,14 +661,18 @@ def cleanup_certificates(args): name = d["name"] domain = get_ssl_certificate_domain(d) - # Expire e2e certs after 4 hours - if domain.startswith("kfct"): - max_age = datetime.timedelta(hours=4) - else: - # For autodeployments delete after seven days - max_age = datetime.timedelta(days=7) - if age > max_age: + infra_type = name_to_infra_type(domain) + + if not infra_type: + logging.info("Skipping certificate named %s for domain %s; " + "it does not match any infra type.", name, domain) + continue + + logging.info("Certificate named %s for domain %s categorized as %s", + name, domain, infra_type) + + if age > MAX_LIFETIME[infra_type]: logging.info("Deleting certifcate: %s for domain %s", d["name"], domain) is_expired = True if not args.dryrun: @@ -679,13 +721,16 @@ def cleanup_service_accounts(args): # Service accounts don't specify the creation date time. So we # use the creation time of the key associated with the account. 
for a in accounts: - if not is_match(a["email"]): - logging.info("Skipping key %s; it does not match expected names.", - a["email"]) + infra_type = name_to_infra_type(a["email"]) + if not infra_type: + logging.info("Skipping service account %s; it does not match any " + "infra type.", a["email"]) unmatched_emails.append(a["email"]) continue + + logging.info("Service account %s categorized as %s", a["email"], infra_type) + keys = keys_client.list(name=a["name"]).execute() is_expired = True @@ -694,7 +739,7 @@ now = datetime.datetime.now(valid_time.tzinfo) age = now - valid_time - if age < datetime.timedelta(hours=args.max_age_hours): + if age < MAX_LIFETIME[infra_type]: is_expired = False break if is_expired: @@ -824,11 +869,15 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br name = d["name"] - if not is_match(name): - logging.info("Skipping Deployment %s; it does not match expected names.", + infra_type = name_to_infra_type(name) + + if not infra_type: + logging.info("Skipping Deployment %s; it does not match any infra type.", name) continue + + logging.info("Deployment %s categorized as %s", name, infra_type) + full_insert_time = d.get("insertTime") age = getAge(full_insert_time) @@ -838,14 +887,14 @@ def cleanup_deployments(args): # pylint: disable=too-many-statements,too-many-br d.get("name"), d.get("operation").get("error")) max_age = datetime.timedelta(minutes=10) else: - max_age = datetime.timedelta(hours=args.max_age_hours) + max_age = MAX_LIFETIME[infra_type] if age < max_age: unexpired.append(name) - logging.info("Deployment %s has not expired", name) + logging.info("Deployment %s has not expired; max age %s", name, max_age) continue - logging.info("Deployment %s has expired", name) + logging.info("Deployment %s has expired; max_age %s", name, max_age) expired.append(name) logging.info("Deleting deployment %s", name) @@ -884,11 +933,16 @@ def cleanup_clusters(args): continue for c in 
clusters["clusters"]: name = c["name"] - if not is_match(name): - logging.info("Skipping cluster%s; it does not match expected names.", + + infra_type = name_to_infra_type(name) + + if not infra_type: + logging.info("Skipping cluster %s; it does not match any infra type.", name) continue + + logging.info("Cluster %s categorized as %s", name, infra_type) + full_insert_time = c["createTime"] insert_time_str = full_insert_time[:-6] tz_offset = full_insert_time[-6:] @@ -906,8 +960,7 @@ def cleanup_clusters(args): logging.info("Cluster %s is in error state; %s", c["name"], c.get("statusMessage", "")) max_age = datetime.timedelta(minutes=10) else: - max_age = datetime.timedelta(hours=args.max_age_hours) - + max_age = MAX_LIFETIME[infra_type] if age > max_age: if c.get("status", "") == "STOPPING": @@ -1015,17 +1068,6 @@ def main(): help=("""Whether to GC backend services that are older than --max_ci_deployment_resource_age_hours.""")) - parser.add_argument( - "--max_ci_deployment_resource_age_hours", - default=24, type=int, - help=("The age of resources in kubeflow-ci-deployment to gc.")) - - parser.add_argument( - "--max_proxy_age_hours", - default=4, type=int, - help=("""The age of forwarding rules, proxy and SSL certificate in - kubeflow-ci-deployment to gc.""")) - parser.add_argument( "--max_wf_age_hours", default=7*24, type=int, help=("How long to wait before garbage collecting Argo workflows.")) @@ -1116,9 +1158,14 @@ def main(): parser_clusters.set_defaults(func=cleanup_clusters) args = parser.parse_args() + # Update max age + MAX_LIFETIME[E2E_INFRA] = datetime.timedelta(hours=args.max_age_hours) + + logging.info("Max lifetime:\n%s", MAX_LIFETIME) + util.maybe_activate_service_account() args.func(args) diff --git a/py/kubeflow/testing/cleanup_ci_test.py b/py/kubeflow/testing/cleanup_ci_test.py new file mode 100644 index 000000000..0eed537ad --- /dev/null +++ b/py/kubeflow/testing/cleanup_ci_test.py @@ -0,0 +1,23 @@ +from kubeflow.testing import cleanup_ci 
+import logging +import pytest + +def test_match_endpoints(): + """Verify that cloud endpoint service names match the regex""" + + service_names = [ + "iap-ingress-kfctl-8c9b.endpoints.kubeflow-ci-deployment.cloud.goog", + ] + + for s in service_names: + assert cleanup_ci.is_match(s, patterns=cleanup_ci.E2E_PATTERNS) + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + pytest.main() diff --git a/test-infra/cleanup-kubeflow-ci-deployment.yaml b/test-infra/cleanup-kubeflow-ci-deployment.yaml index cebc4583d..49d22ba93 100644 --- a/test-infra/cleanup-kubeflow-ci-deployment.yaml +++ b/test-infra/cleanup-kubeflow-ci-deployment.yaml @@ -23,9 +23,9 @@ spec: - command: - /bin/sh - -xc # Stop using PR #481 once its subbmitted - - /usr/local/bin/checkout_repos.sh --repos=kubeflow/kubeflow@HEAD,kubeflow/testing@HEAD:486 --src_dir=/src; python -m kubeflow.testing.cleanup_ci --project=kubeflow-ci-deployment - --gc_backend_services=true all --delete_script=/src/kubeflow/kubeflow/scripts/gke/delete_deployment.sh + # TODO(jlewi): Switch to using init container to check things out + - /usr/local/bin/checkout_repos.sh --repos=kubeflow/kubeflow@HEAD,kubeflow/testing@HEAD:512 --src_dir=/src; python -m kubeflow.testing.cleanup_ci --project=kubeflow-ci-deployment + --gc_backend_services=true --dryrun all env: - name: PYTHONPATH value: /src/kubeflow/testing/py