From 63220ca781c3ed6faeb55b05b0145f8db1e96ce9 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Tue, 28 May 2019 16:02:49 -0500
Subject: [PATCH 01/13] chore(authz-migration): add migration script

---
 bin/migrate_acl_authz.py | 191 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 bin/migrate_acl_authz.py

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
new file mode 100644
index 00000000..38fef876
--- /dev/null
+++ b/bin/migrate_acl_authz.py
@@ -0,0 +1,191 @@
+"""
+This script is used to migrate the `acl` field in indexd to the new `authz` field which
+will be used in combination with arborist to handle access control on indexd records.
+
+The `authz` field should consist of a list of resource tags (as defined by
+arborist---see arborist's readme at https://github.com/uc-cdis/arborist for more info),
+with the meaning that a user trying to access the data file pointed to by this record
+must have access to all the resources listed. These resources may be projects or consent
+codes or some other mechanism for specifying authorization.
+
+In terms of the migration, it isn't discernable from indexd itself whether the items
+listed in the `acl` are programs or projects. For this reason we need access to the
+sheepdog tables with "core data"/"metadata", so we can look up which is which. Then, if
+the record previously had both a program and a project, since the authz field requires
+access to all the listed items, only the project should end up in `authz` (since
+requiring the program would deny access to users who can access only the project).
+
+Furthermore, there are two ways to represent the arborist resources that go into
+`authz`: the path (human-readable string) and the tag (random string, pseudo-UUID). The
+tags are what we want to ultimately put into the `authz` field, since these are
+persistent whereas the path could change if resources are renamed.
+"""
+
+import argparse
+import sys
+
+from cdislogging import get_logger
+import requests
+from sqlalchemy.engine import create_engine
+from sqlalchemy.exc import OperationalError
+
+from indexd.index.drivers.alchemy import IndexRecord, IndexRecordAuthz
+
+
+logger = get_logger("migrate_acl_authz")
+
+
+def main():
+    args = parse_args()
+    sys.path.append(args.path)
+    try:
+        from local_settings import settings
+    except ImportError:
+        logger.info("Can't import local_settings, import from default")
+        from indexd.default_settings import settings
+    driver = settings["config"]["INDEX"]["driver"]
+    try:
+        acl_converter = ACLConverter(args.sheepdog, args.arborist)
+    except EnvironmentError:
+        logger.error("can't continue without database connection")
+        sys.exit(1)
+    with driver.session as session:
+        records = session.query(IndexRecord)
+        for record in records:
+            if not record.acl:
+                logger.info(
+                    "record {} has no acl, setting authz to empty"
+                    .format(record.did)
+                )
+                record.authz = []
+                continue
+            try:
+                record.authz = acl_converter.acl_to_authz(record)
+                session.add(record)
+                logger.info(
+                    "updated authz for {} to {}"
+                    .format(record.did, record.authz.resource)
+                )
+            except EnvironmentError as e:
+                msg = "problem adding authz for record {}: {}".format(record.did, e)
+                logger.error(msg)
+    logger.info("finished migrating")
+    return
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path", default="/var/www/indexd/", help="path to find local_settings.py",
+    )
+    parser.add_argument(
+        "--sheepdog-db", dest="sheepdog", help="URI for the sheepdog database"
+    )
+    parser.add_argument(
+        "--arborist-url", dest="arborist", help="URL for the arborist service"
+    )
+    return parser.parse_args()
+
+
+class ACLConverter(object):
+    def __init__(self, sheepdog_db, arborist_url):
+        self.arborist_url = arborist_url.rstrip("/")
+        self.programs = set()
+        self.projects = dict()
+        # map resource paths to tags in arborist so we can save http calls
+        self.arborist_resources = dict()
+
+        engine = create_engine(sheepdog_db, echo=False)
+        try:
+            connection = engine.connect()
+        except OperationalError:
+            raise EnvironmentError(
+                "couldn't connect to sheepdog db using the provided URI"
+            )
+
+        result = connection.execute("SELECT _props->>'name' as name from node_program;")
+        for row in result:
+            self.programs.add(row["name"])
+
+        result = connection.execute("""
+            SELECT
+                project._props->>'name' AS name,
+                program._props->>'name' AS program
+            FROM node_project AS project
+            JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id
+            JOIN node_program AS program ON edge.dst_id = program.node_id;
+        """)
+        for row in result:
+            self.projects[row["name"]] = row["program"]
+
+        connection.close()
+        return
+
+    def is_program(self, acl_item):
+        return acl_item in self.programs
+
+    def acl_to_authz(self, record):
+        path = None
+        for acl_object in record.acl:
+            acl_item = acl_object.ace
+            if acl_item == "*":
+                path = "/open"
+            elif not path and self.is_program(acl_item):
+                path = "/programs/{}".format(acl_item)
+            else:
+                if acl_item not in self.projects:
+                    raise EnvironmentError(
+                        "program or project {} does not exist".format(acl_item)
+                    )
+                path = "/programs/{}/projects/{}".format(
+                    acl_item, self.projects[acl_item]
+                )
+
+        if not path:
+            logger.error(
+                "couldn't get `authz` for record {} from {}; setting as empty"
+                .format(record.did, record.acl)
+            )
+            return []
+
+        if path not in self.arborist_resources:
+            url = "{}/resource/".format(self.arborist_url)
+            failed = False
+            try:
+                resource = {"path": path}
+                response = requests.post(url, timeout=5, json=resource)
+            except requests.exceptions.Timeout:
+                logger.error(
+                    "couldn't hit arborist to look up resource (timed out): {}".format(url)
+                )
+                failed = True
+            tag = None
+            try:
+                if response.status_code == 409:
+                    # resource is already there, so we'll just take the tag
+                    tag = response.json()["tag"]
+                elif response.status_code != 201:
+                    logger.error(
+                        "couldn't hit arborist at {} to create resource (got {}): {}".format(
+                            url, response.status_code, response.json()
+                        )
+                    )
+                    failed = True
+                else:
+                    # just created the resource for the first time
+                    tag = response.json()["created"]["tag"]
+            except (ValueError, KeyError) as e:
+                raise EnvironmentError(
+                    "couldn't understand response from arborist: {}".format(e)
+                )
+
+            if failed or not tag:
+                raise EnvironmentError("couldn't reach arborist")
+            self.arborist_resources[path] = tag
+
+        tag = self.arborist_resources[path]
+        return [IndexRecordAuthz(did=record.did, resource=tag)]
+
+
+if __name__ == "__main__":
+    main()

From e30df68c82f727465f40ba1573aad2a72605f485 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Thu, 30 May 2019 14:54:15 -0500
Subject: [PATCH 02/13] chore(authz-migration): code review fixes

---
 bin/migrate_acl_authz.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index 38fef876..8071c53e 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -22,6 +22,8 @@
 """
 
 import argparse
+import os
+import re
 import sys
 
 from cdislogging import get_logger
@@ -92,6 +94,7 @@ def __init__(self, sheepdog_db, arborist_url):
         self.arborist_url = arborist_url.rstrip("/")
         self.programs = set()
         self.projects = dict()
+        self.namespace = os.getenv("AUTH_NAMESPACE", "")
         # map resource paths to tags in arborist so we can save http calls
         self.arborist_resources = dict()
 
@@ -128,6 +131,10 @@ def acl_to_authz(self, record):
         path = None
         for acl_object in record.acl:
             acl_item = acl_object.ace
+            # we'll try to do some sanitizing here since the record ACLs are sometimes
+            # really mis-formatted, like `["u'phs000123'"]`
+            acl_item = acl_item.lstrip("u'")
+            acl_item = re.sub(r"\W+", "", acl_item)
             if acl_item == "*":
                 path = "/open"
             elif not path and self.is_program(acl_item):
@@ -148,9 +155,11 @@ def acl_to_authz(self, record):
             )
             return []
 
+        if self.namespace:
+            path = "{}/{}".format(self.namespace, path)
+
         if path not in self.arborist_resources:
             url = "{}/resource/".format(self.arborist_url)
-            failed = False
             try:
                 resource = {"path": path}
                 response = requests.post(url, timeout=5, json=resource)
@@ -158,19 +167,19 @@ def acl_to_authz(self, record):
                 logger.error(
                     "couldn't hit arborist to look up resource (timed out): {}".format(url)
                 )
-                failed = True
+                raise EnvironmentError("couldn't reach arborist; request timed out")
             tag = None
             try:
                 if response.status_code == 409:
                     # resource is already there, so we'll just take the tag
-                    tag = response.json()["tag"]
+                    tag = response.json()["exists"]["tag"]
                 elif response.status_code != 201:
                     logger.error(
                         "couldn't hit arborist at {} to create resource (got {}): {}".format(
                             url, response.status_code, response.json()
                         )
                     )
-                    failed = True
+                    raise EnvironmentError("got unexpected response from arborist")
                 else:
                     # just created the resource for the first time
                     tag = response.json()["created"]["tag"]
@@ -178,8 +187,7 @@ def acl_to_authz(self, record):
                 raise EnvironmentError(
                     "couldn't understand response from arborist: {}".format(e)
                 )
-
-            if failed or not tag:
+            if not tag:
                 raise EnvironmentError("couldn't reach arborist")
             self.arborist_resources[path] = tag
 

From 330d470a47dd70acf50fbcff24b70a016271018d Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Thu, 30 May 2019 16:44:48 -0500
Subject: [PATCH 03/13] chore(authz-migration): try windowed approach

---
 bin/migrate_acl_authz.py | 46 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index 8071c53e..0c23667e 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -28,6 +28,8 @@
 
 from cdislogging import get_logger
 import requests
+import sqlalchemy
+from sqlalchemy import and_, func
 from sqlalchemy.engine import create_engine
 from sqlalchemy.exc import OperationalError
 
@@ -51,9 +53,10 @@ def main():
     except EnvironmentError:
         logger.error("can't continue without database connection")
         sys.exit(1)
+
     with driver.session as session:
         records = session.query(IndexRecord)
-        for record in records:
+        for record in windowed_query(records, IndexRecord.did, args.chunk_size):
             if not record.acl:
                 logger.info(
                     "record {} has no acl, setting authz to empty"
@@ -86,6 +89,13 @@ def parse_args():
     parser.add_argument(
         "--arborist-url", dest="arborist", help="URL for the arborist service"
     )
+    parser.add_argument(
+        "--chunk-size", dest="chunk_size", help="number of records to process at once",
+    )
+    parser.add_argument(
+        "--start-did", dest="start_did",
+        help="did to start at (records processed in lexicographical order)",
+    )
     return parser.parse_args()
 
 
@@ -195,5 +205,39 @@ def acl_to_authz(self, record):
         return [IndexRecordAuthz(did=record.did, resource=tag)]
 
 
+def column_windows(session, column, windowsize):
+
+    def int_for_range(start_id, end_id):
+        if end_id:
+            return and_(column >= start_id, column < end_id)
+        else:
+            return column >= start_id
+
+    q = (
+        session
+        .query(column, func.row_number().over(order_by=column).label('rownum'))
+        .from_self(column)
+    )
+    if windowsize > 1:
+        q = q.filter(sqlalchemy.text("rownum %% %d=1" % windowsize))
+
+    intervals = [id for id, in q]
+
+    while intervals:
+        start = intervals.pop(0)
+        if intervals:
+            end = intervals[0]
+        else:
+            end = None
+        yield int_for_range(start, end)
+        logger.info("doing a commit now")
+
+
+def windowed_query(q, column, windowsize):
+    for whereclause in column_windows(q.session, column, windowsize):
+        for row in q.filter(whereclause).order_by(column):
+            yield row
+
+
 if __name__ == "__main__":
     main()

From 39119a824eafd041108889e9fa000f28cafee715 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Thu, 30 May 2019 17:02:48 -0500
Subject: [PATCH 04/13] chore(authz-migration): remove duplicate slash

---
 bin/migrate_acl_authz.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index 0c23667e..a12e7f4d 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -166,7 +166,7 @@ def acl_to_authz(self, record):
             return []
 
         if self.namespace:
-            path = "{}/{}".format(self.namespace, path)
+            path = self.namespace + path
 
         if path not in self.arborist_resources:
             url = "{}/resource/".format(self.arborist_url)

From 28525440874f315c21e3d1e188c9036f607c300d Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Thu, 30 May 2019 17:51:32 -0500
Subject: [PATCH 05/13] chore(authz-migration): reorganize

---
 bin/migrate_acl_authz.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index a12e7f4d..d1102b59 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -65,12 +65,14 @@ def main():
                 record.authz = []
                 continue
             try:
-                record.authz = acl_converter.acl_to_authz(record)
+                authz = acl_converter.acl_to_authz(record)
+                if authz:
+                    record.authz = [IndexRecordAuthz(did=record.did, resource=authz)]
+                    logger.info("updated authz for {} to {}".format(record.did, authz))
+                else:
+                    record.authz = []
+                    logger.info("updated authz for {} to empty list".format(record.did))
                 session.add(record)
-                logger.info(
-                    "updated authz for {} to {}"
-                    .format(record.did, record.authz.resource)
-                )
             except EnvironmentError as e:
                 msg = "problem adding authz for record {}: {}".format(record.did, e)
                 logger.error(msg)
@@ -163,7 +165,7 @@ def acl_to_authz(self, record):
                 "couldn't get `authz` for record {} from {}; setting as empty"
                 .format(record.did, record.acl)
             )
-            return []
+            return None
 
         if self.namespace:
             path = self.namespace + path
@@ -201,8 +203,7 @@ def acl_to_authz(self, record):
                 raise EnvironmentError("couldn't reach arborist")
             self.arborist_resources[path] = tag
 
-        tag = self.arborist_resources[path]
-        return [IndexRecordAuthz(did=record.did, resource=tag)]
+        return self.arborist_resources[path]
 
 
 def column_windows(session, column, windowsize):

From 28df2db27b161bb86b3108d352ec65db10102298 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Mon, 3 Jun 2019 12:49:44 -0500
Subject: [PATCH 06/13] chore(authz-migration): add ?p

---
 bin/migrate_acl_authz.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index d1102b59..4a256d38 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -171,7 +171,8 @@ def acl_to_authz(self, record):
             path = self.namespace + path
 
         if path not in self.arborist_resources:
-            url = "{}/resource/".format(self.arborist_url)
+            # add `?p` to create parent resources as necessary
+            url = "{}/resource/?p".format(self.arborist_url)
             try:
                 resource = {"path": path}
                 response = requests.post(url, timeout=5, json=resource)

From 8b6b744450391d886ba142d1b1e4b43a754a4809 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Mon, 3 Jun 2019 15:27:40 -0500
Subject: [PATCH 07/13] chore(authz-migration): more logs and partial commits

---
 bin/migrate_acl_authz.py | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index 4a256d38..b48b39ce 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -55,8 +55,8 @@ def main():
         sys.exit(1)
 
     with driver.session as session:
-        records = session.query(IndexRecord)
-        for record in windowed_query(records, IndexRecord.did, args.chunk_size):
+        q = session.query(IndexRecord)
+        for record in windowed_query(session, q, IndexRecord.did, int(args.chunk_size)):
             if not record.acl:
                 logger.info(
                     "record {} has no acl, setting authz to empty"
@@ -92,7 +92,8 @@ def parse_args():
         "--arborist-url", dest="arborist", help="URL for the arborist service"
     )
     parser.add_argument(
-        "--chunk-size", dest="chunk_size", help="number of records to process at once",
+        "--chunk-size", dest="chunk_size", type=int, default=1000,
+        help="number of records to process at once",
     )
     parser.add_argument(
         "--start-did", dest="start_did",
@@ -107,6 +108,10 @@ def __init__(self, sheepdog_db, arborist_url):
         self.programs = set()
         self.projects = dict()
         self.namespace = os.getenv("AUTH_NAMESPACE", "")
+        if self.namespace:
+            logger.info("using namespace {}".format(self.namespace))
+        else:
+            logger.info("not using any auth namespace")
         # map resource paths to tags in arborist so we can save http calls
         self.arborist_resources = dict()
 
@@ -132,9 +137,14 @@ def __init__(self, sheepdog_db, arborist_url):
         """)
         for row in result:
             self.projects[row["name"]] = row["program"]
-
         connection.close()
-        return
+
+        logger.info("found programs: {}".format(list(self.programs)))
+        projects_log = [
+            "{} (from program {})".format(project, program)
+            for project, program in self.projects.items()
+        ]
+        logger.info("found projects: [{}]".format(", ".join(projects_log)))
 
     def is_program(self, acl_item):
         return acl_item in self.programs
@@ -144,7 +154,8 @@ def acl_to_authz(self, record):
         for acl_object in record.acl:
             acl_item = acl_object.ace
             # we'll try to do some sanitizing here since the record ACLs are sometimes
-            # really mis-formatted, like `["u'phs000123'"]`
+            # really mis-formatted, like `["u'phs000123'"]`, or have spaces left in
+            acl_item = acl_item.strip(" ")
             acl_item = acl_item.lstrip("u'")
             acl_item = re.sub(r"\W+", "", acl_item)
             if acl_item == "*":
@@ -152,6 +163,8 @@ def acl_to_authz(self, record):
             elif not path and self.is_program(acl_item):
                 path = "/programs/{}".format(acl_item)
             else:
+                if not acl_item:
+                    return None
                 if acl_item not in self.projects:
                     raise EnvironmentError(
                         "program or project {} does not exist".format(acl_item)
@@ -183,6 +196,10 @@ def acl_to_authz(self, record):
                 raise EnvironmentError("couldn't reach arborist; request timed out")
             tag = None
             try:
+                logger.debug(
+                    "got {} from arborist: {}"
+                    .format(response.status_code, response.json())
+                )
                 if response.status_code == 409:
                     # resource is already there, so we'll just take the tag
                     tag = response.json()["exists"]["tag"]
@@ -203,6 +220,7 @@ def acl_to_authz(self, record):
             if not tag:
                 raise EnvironmentError("couldn't reach arborist")
             self.arborist_resources[path] = tag
+            logger.info("using tag {} for path {}".format(tag, path))
 
         return self.arborist_resources[path]
 
@@ -232,13 +250,14 @@ def int_for_range(start_id, end_id):
         else:
             end = None
         yield int_for_range(start, end)
-        logger.info("doing a commit now")
 
 
-def windowed_query(q, column, windowsize):
+def windowed_query(session, q, column, windowsize):
     for whereclause in column_windows(q.session, column, windowsize):
         for row in q.filter(whereclause).order_by(column):
             yield row
+        session.commit()
+        logger.info("committed progress to database")
 
 
 if __name__ == "__main__":

From 6284424e251bd8f4c355ae97684129fe708bfcca Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Wed, 5 Jun 2019 11:31:09 -0500
Subject: [PATCH 08/13] chore(authz-migration): add start-did logic

---
 bin/migrate_acl_authz.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index b48b39ce..e8e6009c 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -56,6 +56,8 @@ def main():
 
     with driver.session as session:
         q = session.query(IndexRecord)
+        if getattr(args, "start_did"):
+            q = q.filter(IndexRecord.did >= args.start_did)
         for record in windowed_query(session, q, IndexRecord.did, int(args.chunk_size)):
             if not record.acl:
                 logger.info(

From 5b0e571de3413b48936e1d956d1b41c6c35d1a08 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Wed, 5 Jun 2019 13:50:26 -0500
Subject: [PATCH 09/13] chore(authz-migration): log pathological cases

---
 bin/migrate_acl_authz.py | 83 +++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 35 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index e8e6009c..b4846be8 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -49,7 +49,7 @@ def main():
         from indexd.default_settings import settings
     driver = settings["config"]["INDEX"]["driver"]
     try:
-        acl_converter = ACLConverter(args.sheepdog, args.arborist)
+        acl_converter = ACLConverter(args.arborist, getattr(args, "sheepdog"))
     except EnvironmentError:
         logger.error("can't continue without database connection")
         sys.exit(1)
@@ -105,65 +105,72 @@ def parse_args():
 
 
 class ACLConverter(object):
-    def __init__(self, sheepdog_db, arborist_url):
+    def __init__(self, arborist_url, sheepdog_db=None):
         self.arborist_url = arborist_url.rstrip("/")
         self.programs = set()
         self.projects = dict()
-        self.namespace = os.getenv("AUTH_NAMESPACE", "")
+        self.namespace = ("/" + os.getenv("AUTH_NAMESPACE", "").strip("/")).rstrip("/")
         if self.namespace:
             logger.info("using namespace {}".format(self.namespace))
         else:
             logger.info("not using any auth namespace")
         # map resource paths to tags in arborist so we can save http calls
         self.arborist_resources = dict()
+        self.use_sheepdog_db = bool(sheepdog_db)
 
-        engine = create_engine(sheepdog_db, echo=False)
-        try:
-            connection = engine.connect()
-        except OperationalError:
-            raise EnvironmentError(
-                "couldn't connect to sheepdog db using the provided URI"
-            )
-
-        result = connection.execute("SELECT _props->>'name' as name from node_program;")
-        for row in result:
-            self.programs.add(row["name"])
-
-        result = connection.execute("""
-            SELECT
-                project._props->>'name' AS name,
-                program._props->>'name' AS program
-            FROM node_project AS project
-            JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id
-            JOIN node_program AS program ON edge.dst_id = program.node_id;
-        """)
-        for row in result:
-            self.projects[row["name"]] = row["program"]
-        connection.close()
-
-        logger.info("found programs: {}".format(list(self.programs)))
-        projects_log = [
-            "{} (from program {})".format(project, program)
-            for project, program in self.projects.items()
-        ]
-        logger.info("found projects: [{}]".format(", ".join(projects_log)))
+        if sheepdog_db:
+            engine = create_engine(sheepdog_db, echo=False)
+            try:
+                connection = engine.connect()
+            except OperationalError:
+                raise EnvironmentError(
+                    "couldn't connect to sheepdog db using the provided URI"
+                )
+            result = connection.execute("SELECT _props->>'name' as name from node_program;")
+            for row in result:
+                self.programs.add(row["name"])
+            result = connection.execute("""
+                SELECT
+                    project._props->>'name' AS name,
+                    program._props->>'name' AS program
+                FROM node_project AS project
+                JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id
+                JOIN node_program AS program ON edge.dst_id = program.node_id;
+            """)
+            for row in result:
+                self.projects[row["name"]] = row["program"]
+            connection.close()
+            logger.info("found programs: {}".format(list(self.programs)))
+            projects_log = [
+                "{} (from program {})".format(project, program)
+                for project, program in self.projects.items()
+            ]
+            logger.info("found projects: [{}]".format(", ".join(projects_log)))
 
     def is_program(self, acl_item):
         return acl_item in self.programs
 
     def acl_to_authz(self, record):
         path = None
+        programs_found = 0
+        projects_found = 0
         for acl_object in record.acl:
             acl_item = acl_object.ace
             # we'll try to do some sanitizing here since the record ACLs are sometimes
             # really mis-formatted, like `["u'phs000123'"]`, or have spaces left in
             acl_item = acl_item.strip(" ")
             acl_item = acl_item.lstrip("u'")
-            acl_item = re.sub(r"\W+", "", acl_item)
+            if acl_item != "*":
+                acl_item = re.sub(r"\W+", "", acl_item)
             if acl_item == "*":
                 path = "/open"
-            elif not path and self.is_program(acl_item):
+                break
+            elif (
+                not self.use_sheepdog_db
+                or (projects_found == 0 and self.is_program(acl_item))
+            ):
                 path = "/programs/{}".format(acl_item)
+                programs_found += 1
             else:
                 if not acl_item:
                     return None
@@ -174,6 +181,7 @@ def acl_to_authz(self, record):
                 path = "/programs/{}/projects/{}".format(
                     acl_item, self.projects[acl_item]
                 )
+                projects_found += 1
 
         if not path:
             logger.error(
@@ -182,6 +190,11 @@ def acl_to_authz(self, record):
             )
             return None
 
+        if programs_found > 1:
+            logger.error("found multiple programs in ACL for {}".format(record.did))
+        if projects_found > 1:
+            logger.error("found multiple projects in ACL for {}".format(record.did))
+
         if self.namespace:
             path = self.namespace + path
 

From 690a2fc24000ed23534ce30bb70738305ecc57a7 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Thu, 6 Jun 2019 13:39:06 -0500
Subject: [PATCH 10/13] chore(authz-migration): handle empty acl correctly

---
 bin/migrate_acl_authz.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index b4846be8..6024461b 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -159,29 +159,38 @@ def acl_to_authz(self, record):
             # we'll try to do some sanitizing here since the record ACLs are sometimes
             # really mis-formatted, like `["u'phs000123'"]`, or have spaces left in
             acl_item = acl_item.strip(" ")
-            acl_item = acl_item.lstrip("u'")
+            acl_item = re.sub(r"^u(?=['\"])", "", acl_item)
             if acl_item != "*":
                 acl_item = re.sub(r"\W+", "", acl_item)
-            if acl_item == "*":
+
+            # update path based on ACL entry
+            if not acl_item:
+                # ignore empty string
+                continue
+            elif acl_item == "*":
+                # if there's a * it should just be open. return early
                 path = "/open"
                 break
             elif (
                 not self.use_sheepdog_db
                 or (projects_found == 0 and self.is_program(acl_item))
             ):
+                # if we don't have sheepdog we have to assume everything is a "program".
+                # also, we only want to set the path to a program if we haven't found a
+                # path for a project already.
                 path = "/programs/{}".format(acl_item)
                 programs_found += 1
-            else:
-                if not acl_item:
-                    return None
-                if acl_item not in self.projects:
-                    raise EnvironmentError(
-                        "program or project {} does not exist".format(acl_item)
-                    )
+            elif acl_item in self.projects:
+                # always want to update to project if possible
                 path = "/programs/{}/projects/{}".format(
-                    acl_item, self.projects[acl_item]
+                    self.projects[acl_item], acl_item
                 )
                 projects_found += 1
+            else:
+                # nothing worked, raise exception
+                raise EnvironmentError(
+                    "program or project {} does not exist".format(acl_item)
+                )
 
         if not path:
             logger.error(

From 138a66c9476c6956e8ef1e26f47db8a199a763f1 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Fri, 7 Jun 2019 12:03:42 -0500
Subject: [PATCH 11/13] chore(authz-migration): refactor start-did

---
 bin/migrate_acl_authz.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index 6024461b..b1c7f46b 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -56,9 +56,14 @@ def main():
 
     with driver.session as session:
         q = session.query(IndexRecord)
-        if getattr(args, "start_did"):
-            q = q.filter(IndexRecord.did >= args.start_did)
-        for record in windowed_query(session, q, IndexRecord.did, int(args.chunk_size)):
+        wq = windowed_query(
+            session,
+            q,
+            IndexRecord.did,
+            int(args.chunk_size),
+            start=getattr(args, "start_did")
+        )
+        for record in wq:
             if not record.acl:
                 logger.info(
                     "record {} has no acl, setting authz to empty"
@@ -249,7 +254,7 @@ def acl_to_authz(self, record):
         return self.arborist_resources[path]
 
 
-def column_windows(session, column, windowsize):
+def column_windows(session, column, windowsize, start=None):
 
     def int_for_range(start_id, end_id):
         if end_id:
@@ -262,6 +267,8 @@ def int_for_range(start_id, end_id):
         .query(column, func.row_number().over(order_by=column).label('rownum'))
         .from_self(column)
     )
+    if start:
+        q = q.filter(column >= start)
     if windowsize > 1:
         q = q.filter(sqlalchemy.text("rownum %% %d=1" % windowsize))
 
@@ -276,8 +283,8 @@ def int_for_range(start_id, end_id):
         yield int_for_range(start, end)
 
 
-def windowed_query(session, q, column, windowsize):
-    for whereclause in column_windows(q.session, column, windowsize):
+def windowed_query(session, q, column, windowsize, start=None):
+    for whereclause in column_windows(q.session, column, windowsize, start=start):
         for row in q.filter(whereclause).order_by(column):
             yield row
         session.commit()

From 26ced98ea7feaf52bd8614dbe53481adebecee71 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Fri, 7 Jun 2019 12:09:16 -0500
Subject: [PATCH 12/13] chore(authz-migration): add log for start-did

---
 bin/migrate_acl_authz.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index b1c7f46b..1e990c09 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -54,6 +54,9 @@ def main():
         logger.error("can't continue without database connection")
         sys.exit(1)
 
+    if hasattr(args, "start_did"):
+        logger.info("starting at did {}".format(args.start_did))
+
     with driver.session as session:
         q = session.query(IndexRecord)
         wq = windowed_query(

From f8f6ebb027adf337349160293c8b8c85aba51979 Mon Sep 17 00:00:00 2001
From: Rudyard Richter <rudyardrichter@uchicago.edu>
Date: Fri, 7 Jun 2019 14:43:00 -0500
Subject: [PATCH 13/13] chore(authz-migration): fix logs

---
 bin/migrate_acl_authz.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py
index 1e990c09..1c3eb2be 100644
--- a/bin/migrate_acl_authz.py
+++ b/bin/migrate_acl_authz.py
@@ -203,12 +203,12 @@ def acl_to_authz(self, record):
         if not path:
             logger.error(
                 "couldn't get `authz` for record {} from {}; setting as empty"
-                .format(record.did, record.acl)
+                .format(record.did, record.acl.ace)
             )
             return None
 
         if programs_found > 1:
-            logger.error("found multiple projects in ACL for {}".format(record.did))
+            logger.error("found multiple programs in ACL for {}".format(record.did))
         if projects_found > 1:
             logger.error("found multiple projects in ACL for {}".format(record.did))