Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(authz-migration): add migration script #218

Merged
merged 14 commits into from
Jun 7, 2019
298 changes: 298 additions & 0 deletions bin/migrate_acl_authz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
"""
This script is used to migrate the `acl` field in indexd to the new `authz` field which
will be used in combination with arborist to handle access control on indexd records.

The `authz` field should consist of a list of resource tags (as defined by
arborist---see arborist's readme at https://github.com/uc-cdis/arborist for more info),
with the meaning that a user trying to access the data file pointed to by this record
must have access to all the resources listed. These resources may be projects or consent
codes or some other mechanism for specifying authorization.

In terms of the migration, it isn't discernable from indexd itself whether the items
listed in the `acl` are programs or projects. For this reason we need access to the
sheepdog tables with "core data"/"metadata", so we can look up which is which. Then, if
the record previously had both a program and a project, since the authz field requires
access to all the listed items, only the project should end up in `authz` (since
requiring the program would omit access to users who can access only the project).

Furthermore, there are two ways to represent the arborist resources that go into
`authz`: the path (human-readable string) and the tag (random string, pseudo-UUID). The
tags are what we want to ultimately put into the `authz` field, since these are
persistent whereas the path could change if resources are renamed.
"""

import argparse
import os
import re
import sys

from cdislogging import get_logger
import requests
import sqlalchemy
from sqlalchemy import and_, func
from sqlalchemy.engine import create_engine
from sqlalchemy.exc import OperationalError

from indexd.index.drivers.alchemy import IndexRecord, IndexRecordAuthz


# Module-level logger shared by every function and class in this script.
logger = get_logger("migrate_acl_authz")


def main():
    """
    Migrate the ``acl`` of every indexd record into the ``authz`` field.

    Parses the command line, loads the indexd settings (``local_settings`` from
    ``--path`` if importable, otherwise ``indexd.default_settings``), builds an
    ``ACLConverter`` and then walks all ``IndexRecord`` rows in windows of
    ``--chunk-size`` ordered by did. Records with no ACL get an empty ``authz``;
    other records get the single resource returned by
    ``ACLConverter.acl_to_authz``. A record whose conversion raises
    ``EnvironmentError`` is logged and skipped so the rest of the migration can
    proceed.

    Exits with status 1 if the converter cannot be constructed because of an
    ``EnvironmentError`` (e.g. the sheepdog database is unreachable).
    """
    args = parse_args()
    # local_settings.py lives outside the package; make its directory importable
    sys.path.append(args.path)
    try:
        from local_settings import settings
    except ImportError:
        logger.info("Can't import local_settings, import from default")
        from indexd.default_settings import settings
    driver = settings["config"]["INDEX"]["driver"]
    try:
        acl_converter = ACLConverter(args.arborist, args.sheepdog)
    except EnvironmentError:
        logger.error("can't continue without database connection")
        sys.exit(1)

    # argparse always defines `start_did` (None when the flag is omitted), so
    # check its value rather than whether the attribute exists
    if args.start_did:
        logger.info("starting at did {}".format(args.start_did))

    with driver.session as session:
        q = session.query(IndexRecord)
        wq = windowed_query(
            session,
            q,
            IndexRecord.did,
            int(args.chunk_size),
            start=args.start_did,
        )
        for record in wq:
            if not record.acl:
                logger.info(
                    "record {} has no acl, setting authz to empty"
                    .format(record.did)
                )
                record.authz = []
                continue
            try:
                authz = acl_converter.acl_to_authz(record)
                if authz:
                    record.authz = [IndexRecordAuthz(did=record.did, resource=authz)]
                    logger.info("updated authz for {} to {}".format(record.did, authz))
                else:
                    record.authz = []
                    logger.info("updated authz for {} to empty list".format(record.did))
                session.add(record)
            except EnvironmentError as e:
                # best effort: report the failing record and keep migrating the rest
                msg = "problem adding authz for record {}: {}".format(record.did, e)
                logger.error(msg)
    logger.info("finished migrating")


def parse_args():
    """
    Parse the command-line arguments for the migration script.

    Returns:
        argparse.Namespace with the attributes ``path``, ``sheepdog``,
        ``arborist``, ``chunk_size`` (int) and ``start_did``. Optional flags that
        are omitted are present on the namespace with the value ``None`` (or
        their declared default).

    ``--arborist-url`` is mandatory: the converter cannot create or look up
    resources without it, so a missing value is rejected here with a usage error
    instead of failing later with an ``AttributeError``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--path", default="/var/www/indexd/", help="path to find local_settings.py",
    )
    parser.add_argument(
        "--sheepdog-db", dest="sheepdog", help="URI for the sheepdog database"
    )
    parser.add_argument(
        "--arborist-url", dest="arborist", required=True,
        help="URL for the arborist service",
    )
    parser.add_argument(
        "--chunk-size", dest="chunk_size", type=int, default=1000,
        help="number of records to process at once",
    )
    parser.add_argument(
        "--start-did", dest="start_did",
        help="did to start at (records processed in lexographical order)",
    )
    return parser.parse_args()


class ACLConverter(object):
    """
    Convert the legacy ``acl`` entries of an indexd record into the arborist
    resource tag that should be stored in the record's ``authz`` field.

    When a sheepdog database URI is supplied, program and project names are
    loaded from it so ACL entries can be classified; without it every entry is
    assumed to be a program. Resource tags returned by arborist are cached per
    path in ``arborist_resources`` to avoid repeated HTTP calls.
    """

    def __init__(self, arborist_url, sheepdog_db=None):
        """
        Args:
            arborist_url: base URL of the arborist service.
            sheepdog_db: optional database URI for sheepdog; when given, the
                program and project names are loaded from it.

        Raises:
            EnvironmentError: if the sheepdog database cannot be connected to.
        """
        self.arborist_url = arborist_url.rstrip("/")
        self.programs = set()
        # maps project name -> name of the program it belongs to
        self.projects = dict()
        self.namespace = self._normalize_namespace(os.getenv("AUTH_NAMESPACE", ""))
        if self.namespace:
            logger.info("using namespace {}".format(self.namespace))
        else:
            logger.info("not using any auth namespace")
        # map resource paths to tags in arborist so we can save http calls
        self.arborist_resources = dict()
        self.use_sheepdog_db = bool(sheepdog_db)

        if sheepdog_db:
            self._load_sheepdog(sheepdog_db)

    @staticmethod
    def _normalize_namespace(raw):
        """
        Return ``raw`` as a path prefix ("/name", no trailing slash), or "" when
        no namespace is configured so that no prefix (and no doubled slash) is
        added to resource paths.
        """
        trimmed = (raw or "").strip("/")
        if not trimmed:
            return ""
        return "/" + trimmed

    @staticmethod
    def _sanitize_acl_item(raw):
        """
        Clean up a raw ACL entry.

        Record ACLs are sometimes mis-formatted, like ``["u'phs000123'"]``, or
        have spaces left in. Only a literal ``u'``/``u"`` prefix is dropped
        (``str.lstrip`` would eat any leading "u" characters of the real name);
        remaining non-word characters are removed unless the item is ``*``.
        """
        item = raw.strip(" ")
        if item.startswith(("u'", 'u"')):
            item = item[1:]
        if item != "*":
            item = re.sub(r"\W+", "", item)
        return item

    def _load_sheepdog(self, sheepdog_db):
        """
        Populate ``self.programs`` and ``self.projects`` from the sheepdog
        database, always closing the connection afterwards.

        Raises:
            EnvironmentError: if the connection cannot be established.
        """
        engine = create_engine(sheepdog_db, echo=False)
        try:
            connection = engine.connect()
        except OperationalError:
            raise EnvironmentError(
                "couldn't connect to sheepdog db using the provided URI"
            )
        try:
            result = connection.execute(
                "SELECT _props->>'name' as name from node_program;"
            )
            for row in result:
                self.programs.add(row["name"])
            result = connection.execute("""
                SELECT
                    project._props->>'name' AS name,
                    program._props->>'name' AS program
                FROM node_project AS project
                JOIN edge_projectmemberofprogram AS edge ON edge.src_id = project.node_id
                JOIN node_program AS program ON edge.dst_id = program.node_id;
            """)
            for row in result:
                self.projects[row["name"]] = row["program"]
        finally:
            connection.close()
        logger.info("found programs: {}".format(list(self.programs)))
        projects_log = [
            "{} (from program {})".format(project, program)
            for project, program in self.projects.items()
        ]
        logger.info("found projects: [{}]".format(", ".join(projects_log)))

    def is_program(self, acl_item):
        """Return True if ``acl_item`` is a program name loaded from sheepdog."""
        return acl_item in self.programs

    def acl_to_authz(self, record):
        """
        Work out the arborist resource tag for ``record`` from its ACL.

        Args:
            record: object with a ``did`` and an ``acl`` iterable whose items
                expose the ACL string as ``.ace``.

        Returns:
            The arborist tag for the resulting resource path, or ``None`` if no
            usable ACL entry was found.

        Raises:
            EnvironmentError: if an ACL entry is neither a known program nor a
                known project, or if arborist cannot be reached or understood.
        """
        path = None
        programs_found = 0
        projects_found = 0
        for acl_object in record.acl:
            acl_item = self._sanitize_acl_item(acl_object.ace)

            # update path based on ACL entry
            if not acl_item:
                # ignore empty string
                continue
            elif acl_item == "*":
                # if there's a * it should just be open. return early
                path = "/open"
                break
            elif (
                not self.use_sheepdog_db
                or (projects_found == 0 and self.is_program(acl_item))
            ):
                # if we don't have sheepdog we have to assume everything is a "program".
                # also, we only want to set the path to a program if we haven't found a
                # path for a project already.
                path = "/programs/{}".format(acl_item)
                programs_found += 1
            elif acl_item in self.projects:
                # always want to update to project if possible; `self.projects`
                # maps the project name to its program, so the program goes first
                path = "/programs/{}/projects/{}".format(
                    self.projects[acl_item], acl_item
                )
                projects_found += 1
            elif self.is_program(acl_item):
                # a program listed after a project: the project path already
                # set is the more specific one, so keep it
                programs_found += 1
            else:
                # nothing worked, raise exception
                raise EnvironmentError(
                    "program or project {} does not exist".format(acl_item)
                )

        if not path:
            logger.error(
                "couldn't get `authz` for record {} from {}; setting as empty"
                .format(record.did, [acl_object.ace for acl_object in record.acl])
            )
            return None

        if programs_found > 1:
            logger.error("found multiple programs in ACL for {}".format(record.did))
        if projects_found > 1:
            logger.error("found multiple projects in ACL for {}".format(record.did))

        if self.namespace:
            path = self.namespace + path

        return self._get_resource_tag(path)

    def _get_resource_tag(self, path):
        """
        Return the arborist tag for ``path``, creating the resource in arborist
        on first use and caching the tag for later calls.

        Raises:
            EnvironmentError: on timeout, unexpected status code, unparseable
                response, or an empty tag.
        """
        if path in self.arborist_resources:
            return self.arborist_resources[path]

        # add `?p` to create parent resources as necessary
        url = "{}/resource/?p".format(self.arborist_url)
        try:
            response = requests.post(url, timeout=5, json={"path": path})
        except requests.exceptions.Timeout:
            logger.error(
                "couldn't hit arborist to look up resource (timed out): {}".format(url)
            )
            raise EnvironmentError("couldn't reach arborist; request timed out")
        tag = None
        try:
            body = response.json()
            logger.debug(
                "got {} from arborist: {}".format(response.status_code, body)
            )
            if response.status_code == 409:
                # resource is already there, so we'll just take the tag
                tag = body["exists"]["tag"]
            elif response.status_code != 201:
                logger.error(
                    "couldn't hit arborist at {} to create resource (got {}): {}".format(
                        url, response.status_code, body
                    )
                )
                raise EnvironmentError("got unexpected response from arborist")
            else:
                # just created the resource for the first time
                tag = body["created"]["tag"]
        except (ValueError, KeyError) as e:
            raise EnvironmentError(
                "couldn't understand response from arborist: {}".format(e)
            )
        if not tag:
            raise EnvironmentError("couldn't reach arborist")
        self.arborist_resources[path] = tag
        logger.info("using tag {} for path {}".format(tag, path))
        return tag


def column_windows(session, column, windowsize, start=None):
    """
    Yield WHERE-clause expressions that split ``column`` into consecutive
    ranges of roughly ``windowsize`` rows each.

    Args:
        session: SQLAlchemy session used to query the window boundaries.
        column: the column to window over (rows are numbered in its order).
        windowsize: number of rows per window.
        start: optional lowest column value to include.

    Yields:
        For every window, an expression ``column >= lower AND column < upper``;
        the final window has no upper bound.
    """

    def int_for_range(start_id, end_id):
        if end_id:
            return and_(column >= start_id, column < end_id)
        return column >= start_id

    numbered = session.query(
        column, func.row_number().over(order_by=column).label('rownum')
    )
    if start:
        # restrict the rows *before* they are numbered, so the first window
        # begins at the first value >= start; filtering after numbering would
        # skip every row between `start` and the next window boundary
        numbered = numbered.filter(column >= start)
    q = numbered.from_self(column)
    if windowsize > 1:
        # keep only the first row of each window as a boundary
        q = q.filter(sqlalchemy.text("rownum %% %d=1" % windowsize))

    boundaries = [value for value, in q]

    last_index = len(boundaries) - 1
    for index, lower in enumerate(boundaries):
        upper = boundaries[index + 1] if index < last_index else None
        yield int_for_range(lower, upper)


def windowed_query(session, q, column, windowsize, start=None):
    """
    Iterate over the rows of ``q`` one window at a time, ordered by ``column``.

    The window boundaries come from ``column_windows``. After all rows of a
    window have been yielded, ``session`` is committed and the progress is
    logged before the next window is fetched.
    """
    windows = column_windows(q.session, column, windowsize, start=start)
    for window_filter in windows:
        chunk = q.filter(window_filter).order_by(column)
        for record in chunk:
            yield record
        # the whole window has been consumed; persist the caller's changes
        session.commit()
        logger.info("committed progress to database")


# Run the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()