From 63220ca781c3ed6faeb55b05b0145f8db1e96ce9 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Tue, 28 May 2019 16:02:49 -0500
Subject: [PATCH] chore(authz-migration): add migration script

---
 bin/migrate_acl_authz.py | 191 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 bin/migrate_acl_authz.py

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
new file mode 100644
index 00000000..38fef876
--- /dev/null
+++ b/bin/migrate_acl_authz.py
@@ -0,0 +1,191 @@
+"""
+This script is used to migrate the `acl` field in indexd to the new `authz` field which
+will be used in combination with arborist to handle access control on indexd records.
+
+The `authz` field should consist of a list of resource tags (as defined by
+arborist---see arborist's readme at https://github.com/uc-cdis/arborist for more info),
+with the meaning that a user trying to access the data file pointed to by this record
+must have access to all the resources listed. These resources may be projects or consent
+codes or some other mechanism for specifying authorization.
+
+In terms of the migration, it isn't discernible from indexd itself whether the items
+listed in the `acl` are programs or projects. For this reason we need access to the
+sheepdog tables with "core data"/"metadata", so we can look up which is which. Then, if
+the record previously had both a program and a project, since the authz field requires
+access to all the listed items, only the project should end up in `authz` (since
+requiring the program would deny access to users who can access only the project).
+
+Furthermore, there are two ways to represent the arborist resources that go into
+`authz`: the path (human-readable string) and the tag (random string, pseudo-UUID). The
+tags are what we want to ultimately put into the `authz` field, since these are
+persistent whereas the path could change if resources are renamed.
+"""
+
+import argparse
+import sys
+
+from cdislogging import get_logger
+import requests
+from sqlalchemy.engine import create_engine
+from sqlalchemy.exc import OperationalError
+
+from indexd.index.drivers.alchemy import IndexRecord, IndexRecordAuthz
+
+
+logger = get_logger("migrate_acl_authz")  # shared by main() and ACLConverter
+
+
+def main():
+    """
+    Migrate every indexd record's `acl` into its `authz` field.
+
+    Exits 1 if the converter can't be built; per-record EnvironmentErrors are logged.
+    """
+    args = parse_args()
+    sys.path.append(args.path)  # lets local_settings.py be imported from --path
+    try:
+        from local_settings import settings
+    except ImportError:
+        logger.info("Can't import local_settings, import from default")
+        from indexd.default_settings import settings
+    driver = settings["config"]["INDEX"]["driver"]
+    try:
+        acl_converter = ACLConverter(args.sheepdog, args.arborist)
+    except EnvironmentError:
+        logger.error("can't continue without database connection")
+        sys.exit(1)
+    with driver.session as session:
+        for record in session.query(IndexRecord):
+            if not record.acl:
+                logger.info(
+                    "record {} has no acl, setting authz to empty".format(record.did)
+                )
+                record.authz = []
+                continue
+            try:
+                record.authz = acl_converter.acl_to_authz(record)
+                session.add(record)
+                tags = [authz.resource for authz in record.authz]  # authz is a list
+                logger.info("updated authz for {} to {}".format(record.did, tags))
+            except EnvironmentError as e:
+                msg = "problem adding authz for record {}: {}".format(record.did, e)
+                logger.error(msg)
+    logger.info("finished migrating")
+
+
+def parse_args():
+    """Return parsed CLI options; `sheepdog` and `arborist` must be supplied."""
+    parser = argparse.ArgumentParser()
+    path_help = "path to find local_settings.py"
+    parser.add_argument("--path", default="/var/www/indexd/", help=path_help)
+    # ACLConverter can't work without these, so let argparse demand them up front
+    for flag, dest, text in (
+        ("--sheepdog-db", "sheepdog", "URI for the sheepdog database"),
+        ("--arborist-url", "arborist", "URL for the arborist service"),
+    ):
+        parser.add_argument(flag, dest=dest, required=True, help=text)
+    return parser.parse_args()
+
+
+class ACLConverter(object):
+    """Convert the `acl` of indexd records into arborist resource tags for `authz`."""
+
+    def __init__(self, sheepdog_db, arborist_url):
+        """Load program/project names; EnvironmentError if sheepdog is unreachable."""
+        self.arborist_url = arborist_url.rstrip("/")
+        self.programs = set()
+        self.projects = dict()  # project name -> name of the program it belongs to
+        # map resource paths to tags in arborist so we can save http calls
+        self.arborist_resources = dict()
+
+        engine = create_engine(sheepdog_db, echo=False)
+        try:
+            connection = engine.connect()
+        except OperationalError:
+            raise EnvironmentError(
+                "couldn't connect to sheepdog db using the provided URI"
+            )
+        result = connection.execute("SELECT _props->>'name' as name from node_program;")
+        self.programs.update(row["name"] for row in result)
+        result = connection.execute("""
+            SELECT
+                project._props->>'name' AS name,
+                program._props->>'name' AS program
+            FROM node_project AS project
+            JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id
+            JOIN node_program AS program ON edge.dst_id = program.node_id;
+        """)
+        self.projects.update((row["name"], row["program"]) for row in result)
+        connection.close()
+
+    def is_program(self, acl_item):
+        """Return whether `acl_item` is the name of a sheepdog program."""
+        return acl_item in self.programs
+
+    def acl_to_authz(self, record):
+        """
+        Return the `authz` list for `record`: one IndexRecordAuthz holding the arborist
+        tag, or [] when the ACL yields no resource path.
+
+        Raises EnvironmentError for unknown ACL items or failed arborist lookups.
+        """
+        path = None
+        for acl_object in record.acl:
+            acl_item = acl_object.ace
+            if acl_item == "*":
+                path = "/open"
+            elif not path and self.is_program(acl_item):
+                path = "/programs/{}".format(acl_item)
+            elif acl_item in self.projects:
+                # self.projects maps project -> program, so the program comes first
+                program = self.projects[acl_item]
+                path = "/programs/{}/projects/{}".format(program, acl_item)
+            elif not self.is_program(acl_item):
+                msg = "program or project {} does not exist".format(acl_item)
+                raise EnvironmentError(msg)
+            # otherwise it's a program listed after a more specific entry: keep that
+
+        if not path:
+            logger.error(
+                "couldn't get `authz` for record {} from {}; setting as empty"
+                .format(record.did, record.acl)
+            )
+            return []
+
+        if path not in self.arborist_resources:
+            url = "{}/resource/".format(self.arborist_url)
+            try:
+                response = requests.post(url, timeout=5, json={"path": path})
+            except requests.exceptions.Timeout:
+                logger.error(
+                    "couldn't hit arborist to look up resource (timed out): {}".format(url)
+                )
+                # there is no response to inspect below, so fail this record now
+                raise EnvironmentError("couldn't reach arborist")
+            tag = None
+            try:
+                if response.status_code == 409:
+                    # resource is already there, so we'll just take the tag
+                    tag = response.json()["tag"]
+                elif response.status_code == 201:
+                    # just created the resource for the first time
+                    tag = response.json()["created"]["tag"]
+                else:
+                    logger.error(
+                        "couldn't hit arborist at {} to create resource (got {}): {}".format(
+                            url, response.status_code, response.json()
+                        )
+                    )
+            except (ValueError, KeyError) as e:
+                msg = "couldn't understand response from arborist: {}".format(e)
+                raise EnvironmentError(msg)
+            if not tag:
+                raise EnvironmentError("couldn't reach arborist")
+            self.arborist_resources[path] = tag
+
+        tag = self.arborist_resources[path]
+        return [IndexRecordAuthz(did=record.did, resource=tag)]
+
+
+if __name__ == "__main__":  # run the migration only when executed as a script
+    main()