From 63220ca781c3ed6faeb55b05b0145f8db1e96ce9 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Tue, 28 May 2019 16:02:49 -0500 Subject: [PATCH 01/13] chore(authz-migration): add migration script --- bin/migrate_acl_authz.py | 191 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 bin/migrate_acl_authz.py diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py new file mode 100644 index 00000000..38fef876 --- /dev/null +++ b/bin/migrate_acl_authz.py @@ -0,0 +1,191 @@ +""" +This script is used to migrate the `acl` field in indexd to the new `authz` field which +will be used in combination with arborist to handle access control on indexd records. + +The `authz` field should consist of a list of resource tags (as defined by +arborist---see arborist's readme at https://github.com/uc-cdis/arborist for more info), +with the meaning that a user trying to access the data file pointed to by this record +must have access to all the resources listed. These resources may be projects or consent +codes or some other mechanism for specifying authorization. + +In terms of the migration, it isn't discernable from indexd itself whether the items +listed in the `acl` are programs or projects. For this reason we need access to the +sheepdog tables with "core data"/"metadata", so we can look up which is which. Then, if +the record previously had both a program and a project, since the authz field requires +access to all the listed items, only the project should end up in `authz` (since +requiring the program would omit access to users who can access only the project). + +Furthermore, there are two ways to represent the arborist resources that go into +`authz`: the path (human-readable string) and the tag (random string, pseudo-UUID). The +tags are what we want to ultimately put into the `authz` field, since these are +persistent whereas the path could change if resources are renamed. +""" + +import argparse +import sys + +from cdislogging import get_logger +import requests +from sqlalchemy.engine import create_engine +from sqlalchemy.exc import OperationalError + +from indexd.index.drivers.alchemy import IndexRecord, IndexRecordAuthz + + +logger = get_logger("migrate_acl_authz") + + +def main(): + args = parse_args() + sys.path.append(args.path) + try: + from local_settings import settings + except ImportError: + logger.info("Can't import local_settings, import from default") + from indexd.default_settings import settings + driver = settings["config"]["INDEX"]["driver"] + try: + acl_converter = ACLConverter(args.sheepdog, args.arborist) + except EnvironmentError: + logger.error("can't continue without database connection") + sys.exit(1) + with driver.session as session: + records = session.query(IndexRecord) + for record in records: + if not record.acl: + logger.info( + "record {} has no acl, setting authz to empty" + .format(record.did) + ) + record.authz = [] + continue + try: + record.authz = acl_converter.acl_to_authz(record) + session.add(record) + logger.info( + "updated authz for {} to {}" + .format(record.did, record.authz.resource) + ) + except EnvironmentError as e: + msg = "problem adding authz for record {}: {}".format(record.did, e) + logger.error(msg) + logger.info("finished migrating") + return + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", default="/var/www/indexd/", help="path to find local_settings.py", + ) + parser.add_argument( + "--sheepdog-db", dest="sheepdog", help="URI for the sheepdog database" + ) + parser.add_argument( + "--arborist-url", dest="arborist", help="URL for the arborist service" + ) + return parser.parse_args() + + +class ACLConverter(object): + def __init__(self, sheepdog_db, arborist_url): + self.arborist_url = arborist_url.rstrip("/") + self.programs = set() + self.projects = dict() + # map resource paths to tags in arborist so we can save http calls + self.arborist_resources = dict() + + engine = create_engine(sheepdog_db, echo=False) + try: + connection = engine.connect() + except OperationalError: + raise EnvironmentError( + "couldn't connect to sheepdog db using the provided URI" + ) + + result = connection.execute("SELECT _props->>'name' as name from node_program;") + for row in result: + self.programs.add(row["name"]) + + result = connection.execute(""" + SELECT + project._props->>'name' AS name, + program._props->>'name' AS program + FROM node_project AS project + JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id + JOIN node_program AS program ON edge.dst_id = program.node_id; + """) + for row in result: + self.projects[row["name"]] = row["program"] + + connection.close() + return + + def is_program(self, acl_item): + return acl_item in self.programs + + def acl_to_authz(self, record): + path = None + for acl_object in record.acl: + acl_item = acl_object.ace + if acl_item == "*": + path = "/open" + elif not path and self.is_program(acl_item): + path = "/programs/{}".format(acl_item) + else: + if acl_item not in self.projects: + raise EnvironmentError( + "program or project {} does not exist".format(acl_item) + ) + path = "/programs/{}/projects/{}".format( + acl_item, self.projects[acl_item] + ) + + if not path: + logger.error( + "couldn't get `authz` for record {} from {}; setting as empty" + .format(record.did, record.acl) + ) + return [] + + if path not in self.arborist_resources: + url = "{}/resource/".format(self.arborist_url) + failed = False + try: + resource = {"path": path} + response = requests.post(url, timeout=5, json=resource) + except requests.exceptions.Timeout: + logger.error( + "couldn't hit arborist to look up resource (timed out): {}".format(url) + ) + failed = True + tag = None + try: + if response.status_code == 409: + # resource is already there, so we'll just take the tag + tag = response.json()["tag"] + elif response.status_code != 201: + logger.error( + "couldn't hit arborist at {} to create resource (got {}): {}".format( + url, response.status_code, response.json() + ) + ) + failed = True + else: + # just created the resource for the first time + tag = response.json()["created"]["tag"] + except (ValueError, KeyError) as e: + raise EnvironmentError( + "couldn't understand response from arborist: {}".format(e) + ) + + if failed or not tag: + raise EnvironmentError("couldn't reach arborist") + self.arborist_resources[path] = tag + + tag = self.arborist_resources[path] + return [IndexRecordAuthz(did=record.did, resource=tag)] + + +if __name__ == "__main__": + main() From e30df68c82f727465f40ba1573aad2a72605f485 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Thu, 30 May 2019 14:54:15 -0500 Subject: [PATCH 02/13] chore(authz-migration): code review fixes --- bin/migrate_acl_authz.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index 38fef876..8071c53e 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -22,6 +22,8 @@ """ import argparse +import os +import re import sys from cdislogging import get_logger @@ -92,6 +94,7 @@ def __init__(self, sheepdog_db, arborist_url): self.arborist_url = arborist_url.rstrip("/") self.programs = set() self.projects = dict() + self.namespace = os.getenv("AUTH_NAMESPACE", "") # map resource paths to tags in arborist so we can save http calls self.arborist_resources = dict() @@ -128,6 +131,10 @@ def acl_to_authz(self, record): path = None for acl_object in record.acl: acl_item = acl_object.ace + # we'll try to do some sanitizing here since the record ACLs are sometimes + # really mis-formatted, like `["u'phs000123'"]` + acl_item = acl_item.lstrip("u'") + acl_item = re.sub(r"\W+", "", acl_item) if acl_item == "*": path = "/open" elif not path and self.is_program(acl_item): @@ -148,9 +155,11 @@ def acl_to_authz(self, record): ) return [] + if self.namespace: + path = "{}/{}".format(self.namespace, path) + if path not in self.arborist_resources: url = "{}/resource/".format(self.arborist_url) - failed = False try: resource = {"path": path} response = requests.post(url, timeout=5, json=resource) @@ -158,19 +167,19 @@ def acl_to_authz(self, record): logger.error( "couldn't hit arborist to look up resource (timed out): {}".format(url) ) - failed = True + raise EnvironmentError("couldn't reach arborist; request timed out") tag = None try: if response.status_code == 409: # resource is already there, so we'll just take the tag - tag = response.json()["tag"] + tag = response.json()["exists"]["tag"] elif response.status_code != 201: logger.error( "couldn't hit arborist at {} to create resource (got {}): {}".format( url, response.status_code, response.json() ) ) - failed = True + raise EnvironmentError("got unexpected response from arborist") else: # just created the resource for the first time tag = response.json()["created"]["tag"] @@ -178,8 +187,7 @@ def acl_to_authz(self, record): raise EnvironmentError( "couldn't understand response from arborist: {}".format(e) ) - - if failed or not tag: + if not tag: raise EnvironmentError("couldn't reach arborist") self.arborist_resources[path] = tag From 330d470a47dd70acf50fbcff24b70a016271018d Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Thu, 30 May 2019 16:44:48 -0500 Subject: [PATCH 03/13] chore(authz-migration): try windowed approach --- bin/migrate_acl_authz.py | 46 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index 8071c53e..0c23667e 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -28,6 +28,8 @@ from cdislogging import get_logger import requests +import sqlalchemy +from sqlalchemy import and_, func from sqlalchemy.engine import create_engine from sqlalchemy.exc import OperationalError @@ -51,9 +53,10 @@ def main(): except EnvironmentError: logger.error("can't continue without database connection") sys.exit(1) + with driver.session as session: records = session.query(IndexRecord) - for record in records: + for record in windowed_query(records, IndexRecord.did, args.chunk_size): if not record.acl: logger.info( "record {} has no acl, setting authz to empty" @@ -86,6 +89,13 @@ def parse_args(): parser.add_argument( "--arborist-url", dest="arborist", help="URL for the arborist service" ) + parser.add_argument( + "--chunk-size", dest="chunk_size", help="number of records to process at once", + ) + parser.add_argument( + "--start-did", dest="start_did", + help="did to start at (records processed in lexographical order)", + ) return parser.parse_args() @@ -195,5 +205,39 @@ def acl_to_authz(self, record): return [IndexRecordAuthz(did=record.did, resource=tag)] +def column_windows(session, column, windowsize): + + def int_for_range(start_id, end_id): + if end_id: + return and_(column >= start_id, column < end_id) + else: + return column >= start_id + + q = ( + session + .query(column, func.row_number().over(order_by=column).label('rownum')) + .from_self(column) + ) + if windowsize > 1: + q = q.filter(sqlalchemy.text("rownum %% %d=1" % windowsize)) + + intervals = [id for id, in q] + + while intervals: + start = intervals.pop(0) + if intervals: + end = intervals[0] + else: + end = None + yield int_for_range(start, end) + logger.info("doing a commit now") + + +def windowed_query(q, column, windowsize): + for whereclause in column_windows(q.session, column, windowsize): + for row in q.filter(whereclause).order_by(column): + yield row + + if __name__ == "__main__": main() From 39119a824eafd041108889e9fa000f28cafee715 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Thu, 30 May 2019 17:02:48 -0500 Subject: [PATCH 04/13] chore(authz-migration): remove duplicate slash --- bin/migrate_acl_authz.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index 0c23667e..a12e7f4d 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -166,7 +166,7 @@ def acl_to_authz(self, record): return [] if self.namespace: - path = "{}/{}".format(self.namespace, path) + path = self.namespace + path if path not in self.arborist_resources: url = "{}/resource/".format(self.arborist_url) From 28525440874f315c21e3d1e188c9036f607c300d Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Thu, 30 May 2019 17:51:32 -0500 Subject: [PATCH 05/13] chore(authz-migration): reorganize --- bin/migrate_acl_authz.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index a12e7f4d..d1102b59 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -65,12 +65,14 @@ def main(): record.authz = [] continue try: - record.authz = acl_converter.acl_to_authz(record) + authz = acl_converter.acl_to_authz(record) + if authz: + record.authz = [IndexRecordAuthz(did=record.did, resource=authz)] + logger.info("updated authz for {} to {}".format(record.did, authz)) + else: + record.authz = [] + logger.info("updated authz for {} to empty list".format(record.did)) session.add(record) - logger.info( - "updated authz for {} to {}" - .format(record.did, record.authz.resource) - ) except EnvironmentError as e: msg = "problem adding authz for record {}: {}".format(record.did, e) logger.error(msg) @@ -163,7 +165,7 @@ def acl_to_authz(self, record): "couldn't get `authz` for record {} from {}; setting as empty" .format(record.did, record.acl) ) - return [] + return None if self.namespace: path = self.namespace + path @@ -201,8 +203,7 @@ def acl_to_authz(self, record): raise EnvironmentError("couldn't reach arborist") self.arborist_resources[path] = tag - tag = self.arborist_resources[path] - return [IndexRecordAuthz(did=record.did, resource=tag)] + return self.arborist_resources[path] def column_windows(session, column, windowsize): From 28df2db27b161bb86b3108d352ec65db10102298 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Mon, 3 Jun 2019 12:49:44 -0500 Subject: [PATCH 06/13] chore(authz-migration): add ?p --- bin/migrate_acl_authz.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index d1102b59..4a256d38 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -171,7 +171,8 @@ def acl_to_authz(self, record): path = self.namespace + path if path not in self.arborist_resources: - url = "{}/resource/".format(self.arborist_url) + # add `?p` to create parent resources as necessary + url = "{}/resource/?p".format(self.arborist_url) try: resource = {"path": path} response = requests.post(url, timeout=5, json=resource) From 8b6b744450391d886ba142d1b1e4b43a754a4809 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Mon, 3 Jun 2019 15:27:40 -0500 Subject: [PATCH 07/13] chore(authz-migration): more logs and partial commits --- bin/migrate_acl_authz.py | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index 4a256d38..b48b39ce 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -55,8 +55,8 @@ def main(): sys.exit(1) with driver.session as session: - records = session.query(IndexRecord) - for record in windowed_query(records, IndexRecord.did, args.chunk_size): + q = session.query(IndexRecord) + for record in windowed_query(session, q, IndexRecord.did, int(args.chunk_size)): if not record.acl: logger.info( "record {} has no acl, setting authz to empty" @@ -92,7 +92,8 @@ def parse_args(): "--arborist-url", dest="arborist", help="URL for the arborist service" ) parser.add_argument( - "--chunk-size", dest="chunk_size", help="number of records to process at once", + "--chunk-size", dest="chunk_size", type=int, default=1000, + help="number of records to process at once", ) parser.add_argument( "--start-did", dest="start_did", @@ -107,6 +108,10 @@ def __init__(self, sheepdog_db, arborist_url): self.programs = set() self.projects = dict() self.namespace = os.getenv("AUTH_NAMESPACE", "") + if self.namespace: + logger.info("using namespace {}".format(self.namespace)) + else: + logger.info("not using any auth namespace") # map resource paths to tags in arborist so we can save http calls self.arborist_resources = dict() @@ -132,9 +137,14 @@ def __init__(self, sheepdog_db, arborist_url): """) for row in result: self.projects[row["name"]] = row["program"] - connection.close() - return + + logger.info("found programs: {}".format(list(self.programs))) + projects_log = [ + "{} (from program {})".format(project, program) + for project, program in self.projects.items() + ] + logger.info("found projects: [{}]".format(", ".join(projects_log))) def is_program(self, acl_item): return acl_item in self.programs @@ -144,7 +154,8 @@ def acl_to_authz(self, record): for acl_object in record.acl: acl_item = acl_object.ace # we'll try to do some sanitizing here since the record ACLs are sometimes - # really mis-formatted, like `["u'phs000123'"]` + # really mis-formatted, like `["u'phs000123'"]`, or have spaces left in + acl_item = acl_item.strip(" ") acl_item = acl_item.lstrip("u'") acl_item = re.sub(r"\W+", "", acl_item) if acl_item == "*": @@ -152,6 +163,8 @@ def acl_to_authz(self, record): elif not path and self.is_program(acl_item): path = "/programs/{}".format(acl_item) else: + if not acl_item: + return None if acl_item not in self.projects: raise EnvironmentError( "program or project {} does not exist".format(acl_item) @@ -183,6 +196,10 @@ def acl_to_authz(self, record): raise EnvironmentError("couldn't reach arborist; request timed out") tag = None try: + logger.debug( + "got {} from arborist: {}" + .format(response.status_code, response.json()) + ) if response.status_code == 409: # resource is already there, so we'll just take the tag tag = response.json()["exists"]["tag"] @@ -203,6 +220,7 @@ def acl_to_authz(self, record): if not tag: raise EnvironmentError("couldn't reach arborist") self.arborist_resources[path] = tag + logger.info("using tag {} for path {}".format(tag, path)) return self.arborist_resources[path] @@ -232,13 +250,14 @@ def int_for_range(start_id, end_id): else: end = None yield int_for_range(start, end) - logger.info("doing a commit now") -def windowed_query(q, column, windowsize): +def windowed_query(session, q, column, windowsize): for whereclause in column_windows(q.session, column, windowsize): for row in q.filter(whereclause).order_by(column): yield row + session.commit() + logger.info("committed progress to database") if __name__ == "__main__": From 6284424e251bd8f4c355ae97684129fe708bfcca Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Wed, 5 Jun 2019 11:31:09 -0500 Subject: [PATCH 08/13] chore(authz-migration): add start-did logic --- bin/migrate_acl_authz.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index b48b39ce..e8e6009c 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -56,6 +56,8 @@ def main(): with driver.session as session: q = session.query(IndexRecord) + if getattr(args, "start_did"): + q = q.filter(IndexRecord.did >= args.start_did) for record in windowed_query(session, q, IndexRecord.did, int(args.chunk_size)): if not record.acl: logger.info( From 5b0e571de3413b48936e1d956d1b41c6c35d1a08 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Wed, 5 Jun 2019 13:50:26 -0500 Subject: [PATCH 09/13] chore(authz-migration): log pathological cases --- bin/migrate_acl_authz.py | 83 +++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index e8e6009c..b4846be8 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -49,7 +49,7 @@ def main(): from indexd.default_settings import settings driver = settings["config"]["INDEX"]["driver"] try: - acl_converter = ACLConverter(args.sheepdog, args.arborist) + acl_converter = ACLConverter(args.arborist, getattr(args, "sheepdog")) except EnvironmentError: logger.error("can't continue without database connection") sys.exit(1) @@ -105,65 +105,72 @@ def parse_args(): class ACLConverter(object): - def __init__(self, sheepdog_db, arborist_url): + def __init__(self, arborist_url, sheepdog_db=None): self.arborist_url = arborist_url.rstrip("/") self.programs = set() self.projects = dict() - self.namespace = os.getenv("AUTH_NAMESPACE", "") + self.namespace = "/" + os.getenv("AUTH_NAMESPACE", "").lstrip("/") if self.namespace: logger.info("using namespace {}".format(self.namespace)) else: logger.info("not using any auth namespace") # map resource paths to tags in arborist so we can save http calls self.arborist_resources = dict() + self.use_sheepdog_db = bool(sheepdog_db) - engine = create_engine(sheepdog_db, echo=False) - try: - connection = engine.connect() - except OperationalError: - raise EnvironmentError( - "couldn't connect to sheepdog db using the provided URI" - ) - - result = connection.execute("SELECT _props->>'name' as name from node_program;") - for row in result: - self.programs.add(row["name"]) - - result = connection.execute(""" - SELECT - project._props->>'name' AS name, - program._props->>'name' AS program - FROM node_project AS project - JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id - JOIN node_program AS program ON edge.dst_id = program.node_id; - """) - for row in result: - self.projects[row["name"]] = row["program"] - connection.close() - - logger.info("found programs: {}".format(list(self.programs))) - projects_log = [ - "{} (from program {})".format(project, program) - for project, program in self.projects.items() - ] - logger.info("found projects: [{}]".format(", ".join(projects_log))) + if sheepdog_db: + engine = create_engine(sheepdog_db, echo=False) + try: + connection = engine.connect() + except OperationalError: + raise EnvironmentError( + "couldn't connect to sheepdog db using the provided URI" + ) + result = connection.execute("SELECT _props->>'name' as name from node_program;") + for row in result: + self.programs.add(row["name"]) + result = connection.execute(""" + SELECT + project._props->>'name' AS name, + program._props->>'name' AS program + FROM node_project AS project + JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id + JOIN node_program AS program ON edge.dst_id = program.node_id; + """) + for row in result: + self.projects[row["name"]] = row["program"] + connection.close() + logger.info("found programs: {}".format(list(self.programs))) + projects_log = [ + "{} (from program {})".format(project, program) + for project, program in self.projects.items() + ] + logger.info("found projects: [{}]".format(", ".join(projects_log))) def is_program(self, acl_item): return acl_item in self.programs def acl_to_authz(self, record): path = None + programs_found = 0 + projects_found = 0 for acl_object in record.acl: acl_item = acl_object.ace # we'll try to do some sanitizing here since the record ACLs are sometimes # really mis-formatted, like `["u'phs000123'"]`, or have spaces left in acl_item = acl_item.strip(" ") acl_item = acl_item.lstrip("u'") - acl_item = re.sub(r"\W+", "", acl_item) + if acl_item != "*": + acl_item = re.sub(r"\W+", "", acl_item) if acl_item == "*": path = "/open" - elif not path and self.is_program(acl_item): + break + elif ( + not self.use_sheepdog_db + or (projects_found == 0 and self.is_program(acl_item)) + ): path = "/programs/{}".format(acl_item) + programs_found += 1 else: if not acl_item: return None @@ -174,6 +181,7 @@ def acl_to_authz(self, record): path = "/programs/{}/projects/{}".format( acl_item, self.projects[acl_item] ) + projects_found += 1 if not path: logger.error( @@ -182,6 +190,11 @@ def acl_to_authz(self, record): ) return None + if programs_found > 1: + logger.error("found multiple projects in ACL for {}".format(record.did)) + if projects_found > 1: + logger.error("found multiple projects in ACL for {}".format(record.did)) + if self.namespace: path = self.namespace + path From 690a2fc24000ed23534ce30bb70738305ecc57a7 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Thu, 6 Jun 2019 13:39:06 -0500 Subject: [PATCH 10/13] chore(authz-migration): handle empty acl correctly --- bin/migrate_acl_authz.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index b4846be8..6024461b 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -162,26 +162,35 @@ def acl_to_authz(self, record): acl_item = acl_item.lstrip("u'") if acl_item != "*": acl_item = re.sub(r"\W+", "", acl_item) - if acl_item == "*": + + # update path based on ACL entry + if not acl_item: + # ignore empty string + continue + elif acl_item == "*": + # if there's a * it should just be open. return early path = "/open" break elif ( not self.use_sheepdog_db or (projects_found == 0 and self.is_program(acl_item)) ): + # if we don't have sheepdog we have to assume everything is a "program". + # also, we only want to set the path to a program if we haven't found a + # path for a project already. path = "/programs/{}".format(acl_item) programs_found += 1 - else: - if not acl_item: - return None - if acl_item not in self.projects: - raise EnvironmentError( - "program or project {} does not exist".format(acl_item) - ) + elif acl_item in self.projects: + # always want to update to project if possible path = "/programs/{}/projects/{}".format( acl_item, self.projects[acl_item] ) projects_found += 1 + else: + # nothing worked, raise exception + raise EnvironmentError( + "program or project {} does not exist".format(acl_item) + ) if not path: logger.error( From 138a66c9476c6956e8ef1e26f47db8a199a763f1 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Fri, 7 Jun 2019 12:03:42 -0500 Subject: [PATCH 11/13] chore(authz-migration): refactor start-did --- bin/migrate_acl_authz.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index 6024461b..b1c7f46b 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -56,9 +56,14 @@ def main(): with driver.session as session: q = session.query(IndexRecord) - if getattr(args, "start_did"): - q = q.filter(IndexRecord.did >= args.start_did) - for record in windowed_query(session, q, IndexRecord.did, int(args.chunk_size)): + wq = windowed_query( + session, + q, + IndexRecord.did, + int(args.chunk_size), + start=getattr(args, "start_did") + ) + for record in wq: if not record.acl: logger.info( "record {} has no acl, setting authz to empty" @@ -249,7 +254,7 @@ def acl_to_authz(self, record): return self.arborist_resources[path] -def column_windows(session, column, windowsize): +def column_windows(session, column, windowsize, start=None): def int_for_range(start_id, end_id): if end_id: @@ -262,6 +267,8 @@ def int_for_range(start_id, end_id): .query(column, func.row_number().over(order_by=column).label('rownum')) .from_self(column) ) + if start: + q = q.filter(column >= start) if windowsize > 1: q = q.filter(sqlalchemy.text("rownum %% %d=1" % windowsize)) @@ -276,8 +283,8 @@ def int_for_range(start_id, end_id): yield int_for_range(start, end) -def windowed_query(session, q, column, windowsize): - for whereclause in column_windows(q.session, column, windowsize): +def windowed_query(session, q, column, windowsize, start=None): + for whereclause in column_windows(q.session, column, windowsize, start=start): for row in q.filter(whereclause).order_by(column): yield row session.commit() From 26ced98ea7feaf52bd8614dbe53481adebecee71 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Fri, 7 Jun 2019 12:09:16 -0500 Subject: [PATCH 12/13] chore(authz-migration): add log for start-did --- bin/migrate_acl_authz.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index b1c7f46b..1e990c09 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -54,6 +54,9 @@ def main(): logger.error("can't continue without database connection") sys.exit(1) + if hasattr(args, "start_did"): + logger.info("starting at did {}".format(args.start_did)) + with driver.session as session: q = session.query(IndexRecord) wq = windowed_query( From f8f6ebb027adf337349160293c8b8c85aba51979 Mon Sep 17 00:00:00 2001 From: Rudyard Richter Date: Fri, 7 Jun 2019 14:43:00 -0500 Subject: [PATCH 13/13] chore(authz-migration): fix logs --- bin/migrate_acl_authz.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/migrate_acl_authz.py b/bin/migrate_acl_authz.py index 1e990c09..1c3eb2be 100644 --- a/bin/migrate_acl_authz.py +++ b/bin/migrate_acl_authz.py @@ -203,12 +203,12 @@ def acl_to_authz(self, record): if not path: logger.error( "couldn't get `authz` for record {} from {}; setting as empty" - .format(record.did, record.acl) + .format(record.did, record.acl.ace) ) return None if programs_found > 1: - logger.error("found multiple projects in ACL for {}".format(record.did)) + logger.error("found multiple programs in ACL for {}".format(record.did)) if projects_found > 1: logger.error("found multiple projects in ACL for {}".format(record.did))