From b13c39538deaf01b400c633439951263f35d9540 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Fri, 4 Oct 2019 12:33:40 -0500 Subject: [PATCH 01/15] feat(dbgap-sync): support consent code exchange area access --- fence/config-default.yaml | 14 ++++ fence/sync/sync_users.py | 90 +++++++++++++++------ tests/dbgap_sync/test_user_sync.py | 125 ++++++++++++++++++++++++++++- tests/test-fence-config.yaml | 12 +++ 4 files changed, 214 insertions(+), 27 deletions(-) diff --git a/fence/config-default.yaml b/fence/config-default.yaml index 25f832ba8..a2cc583f7 100644 --- a/fence/config-default.yaml +++ b/fence/config-default.yaml @@ -370,12 +370,26 @@ dbGaP: proxy_user: '' protocol: 'sftp' decrypt_key: '' + # parse out the consent from the dbgap accession number such that something + # like "phs000123.v1.p1.c2" becomes "phs000123.c2" + # + # NOTE: when this is "false" the above would become "phs000123" parse_consent_code: true + # A consent of "c999" indicates access to that study's "exchange area data" + # and when a user has access to one study's exchange area data, they + # have access to "common exchange area data" that is not study specific. + # The below configurations are for the Fence project name a user gets access + # to and the Arborist resource they get access to. + parse_exchange_area_code: false + common_exchange_area_project: 'common_exchange' # A mapping from the dbgap study to which authorization namespaces the actual data # lives in. For example, `studyX` data may exist in multiple organizations, so # we need to know to map authorization to all orgs resources study_to_resource_namespaces: '_default': ['/'] + 'common_exchange': ['/dbgap/'] + # above are for default support and exchange area support + # below are further examples you can safely remove 'studyX': ['/orgA/', '/orgB/'] 'studyY': ['/orgB/', '/orgC/'] 'studyZ': ['/orgD/'] diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 5bc7f9eb1..cd5584347 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -299,6 +299,10 @@ def __init__( self.protocol = dbGaP["protocol"] self.dbgap_key = dbGaP["decrypt_key"] self.parse_consent_code = dbGaP.get("parse_consent_code", True) + self.parse_exchange_area_code = dbGaP.get("parse_exchange_area_code", False) + self.common_exchange_area_project = dbGaP.get( + "common_exchange_area_project", "common_exchange" + ) self.session = db_session self.driver = SQLAlchemyDriver(DB) self.project_mapping = project_mapping or {} @@ -455,7 +459,32 @@ def _parse_csv(self, file_dict, sess, encrypted=True): dbgap_project = phsid[0] if len(phsid) > 1 and self.parse_consent_code: consent_code = phsid[-1] - if consent_code != "c999": + + # c999 indicates access to a study-specific exchange area + # access to at least one study-specific exchange implies access + # to the common exchange area + self.logger.debug( + f"got consent code {consent_code} from dbGaP project " + "{dbgap_project}" + ) + if consent_code == "c999" and self.parse_exchange_area_code: + self.logger.info( + "found study with consent c999 and Fence " + "is configured to parse exchange area data. Giving user " + f"{username} {privileges} privileges in project: " + f"{self.common_exchange_area_project}." + ) + self._add_dbgap_project_for_user( + self.common_exchange_area_project, + privileges, + username, + sess, + user_projects, + ) + + # we want to add the dbgap project if it's a normal consent code + # or if we're parsing the exchange area code + if consent_code != "c999" or self.parse_exchange_area_code: dbgap_project += "." + consent_code display_name = row.get("user name", "") @@ -467,31 +496,9 @@ def _parse_csv(self, file_dict, sess, encrypted=True): } if dbgap_project not in self.project_mapping: - if dbgap_project not in self._projects: - - self.logger.debug( - "creating Project in fence from dbGaP study: {}".format( - dbgap_project - ) - ) - - project = self._get_or_create( - sess, Project, auth_id=dbgap_project - ) - - # need to add dbgap project to arborist - if self.arborist_client: - self._add_dbgap_study_to_arborist(dbgap_project) - - if project.name is None: - project.name = dbgap_project - self._projects[dbgap_project] = project - phsid_privileges = {dbgap_project: set(privileges)} - if username in user_projects: - user_projects[username].update(phsid_privileges) - else: - user_projects[username] = phsid_privileges - + self._add_dbgap_project_for_user( + dbgap_project, privileges, username, sess, user_projects + ) for element_dict in self.project_mapping.get(dbgap_project, []): try: phsid_privileges = { @@ -511,6 +518,33 @@ def _parse_csv(self, file_dict, sess, encrypted=True): self.logger.info(e) return user_projects, user_info + def _add_dbgap_project_for_user( + self, dbgap_project, privileges, username, sess, user_projects + ): + """ + Helper function for csv parsing that adds a given dbgap project to Fence/Arborist + and then updates the dictionary containing all user's project access + """ + if dbgap_project not in self._projects: + self.logger.debug( + "creating Project in fence for dbGaP study: {}".format(dbgap_project) + ) + + project = self._get_or_create(sess, Project, auth_id=dbgap_project) + + # need to add dbgap project to arborist + if self.arborist_client: + self._add_dbgap_study_to_arborist(dbgap_project) + + if project.name is None: + project.name = dbgap_project + self._projects[dbgap_project] = project + phsid_privileges = {dbgap_project: set(privileges)} + if username in user_projects: + user_projects[username].update(phsid_privileges) + else: + user_projects[username] = phsid_privileges + @staticmethod def sync_two_user_info_dict(user_info1, user_info2): """ @@ -976,6 +1010,7 @@ def _sync(self, sess): self.sync_two_user_info_dict(user_info_csv, user_info) # privilleges in yaml files overide ones in csv files + self.logger.error(user_yaml.projects) self.sync_two_phsids_dict(user_yaml.projects, user_projects) self.sync_two_user_info_dict(user_yaml.user_info, user_info) @@ -1259,12 +1294,15 @@ def _add_dbgap_study_to_arborist(self, dbgap_study): .get("study_to_resource_namespaces", {}) .get("_default", ["/"]) ) + self.logger.debug(config["dbGaP"]["study_to_resource_namespaces"]) namespaces = ( config["dbGaP"] .get("study_to_resource_namespaces", {}) .get(dbgap_study, default_namespaces) ) + self.logger.debug(f"dbgap study namespaces: {namespaces}") + arborist_resource_namespaces = [ namespace.rstrip("/") + "/programs/" for namespace in namespaces ] diff --git a/tests/dbgap_sync/test_user_sync.py b/tests/dbgap_sync/test_user_sync.py index 85c4692fe..3660c5116 100644 --- a/tests/dbgap_sync/test_user_sync.py +++ b/tests/dbgap_sync/test_user_sync.py @@ -4,7 +4,7 @@ from fence import models from fence.sync.sync_users import _format_policy_id - +from fence.config import config from tests.dbgap_sync.conftest import LOCAL_YAML_DIR @@ -95,6 +95,129 @@ def test_sync(syncer, db_session, storage_client): assert not user_access +@pytest.mark.parametrize("syncer", ["google"], indirect=True) +@pytest.mark.parametrize("parse_exchange_area_config", [False, True]) +@pytest.mark.parametrize("parse_consent_code_config", [False, True]) +def test_dbgap_consent_codes( + syncer, + db_session, + storage_client, + parse_exchange_area_config, + parse_consent_code_config, + monkeypatch, +): + # patch the sync to use the parameterized value for whether or not to parse exchange + # area data + monkeypatch.setattr(syncer, "parse_exchange_area_code", parse_exchange_area_config) + monkeypatch.setattr(syncer, "parse_consent_code", parse_consent_code_config) + monkeypatch.setattr(syncer, "project_mapping", {}) + + syncer.sync() + + user = models.query_for_user(session=db_session, username="USERC") + if parse_consent_code_config: + if parse_exchange_area_config: + assert user.project_access == { + "phs000179.c1": ["read-storage"], + "phs000178.c999": ["read-storage"], + # should additionally include the study-specific exchange area access and + # access to the common exchange area + config["dbGaP"].get("common_exchange_area_project", ""): [ + "read-storage" + ], + } + else: + assert user.project_access == { + "phs000179.c1": ["read-storage"], + # behavior for c999 when exchange area parsing is NOT enabled is to + # simply provide access to the study itself without a consent group + "phs000178": ["read-storage"], + } + else: + assert user.project_access == { + "phs000178": ["read-storage"], + "phs000179": ["read-storage"], + } + + user = models.query_for_user(session=db_session, username="USERF") + if parse_consent_code_config: + assert user.project_access == { + "phs000178.c1": ["read-storage"], + "phs000178.c2": ["read-storage"], + } + else: + assert user.project_access == {"phs000178": ["read-storage"]} + + user = models.query_for_user(session=db_session, username="TESTUSERB") + if parse_consent_code_config: + assert user.project_access == { + "phs000178.c1": ["read-storage"], + "phs000179.c1": ["read-storage"], + } + else: + assert user.project_access == { + "phs000178": ["read-storage"], + "phs000179": ["read-storage"], + } + + user = models.query_for_user(session=db_session, username="TESTUSERD") + if parse_consent_code_config: + assert user.project_access == {"phs000179.c1": ["read-storage"]} + else: + assert user.project_access == {"phs000179": ["read-storage"]} + + resource_to_parent_paths = {} + for call in syncer.arborist_client.update_resource.call_args_list: + args, kwargs = call + parent_path = args[0] + resource = args[1].get("name") + resource_to_parent_paths.setdefault(resource, []).append(parent_path) + + if parse_consent_code_config: + if parse_exchange_area_config: + assert "phs000178.c999" in resource_to_parent_paths + assert resource_to_parent_paths["phs000178.c999"] == ["/orgA/programs/"] + + assert ( + config["dbGaP"].get("common_exchange_area_project", "") + in resource_to_parent_paths + ) + assert resource_to_parent_paths[ + config["dbGaP"].get("common_exchange_area_project", "") + ] == ["/dbgap/programs/"] + else: + # behavior for c999 when exchange area parsing is NOT enabled is to + # simply provide access to the study itself without a consent group + assert "phs000178" in resource_to_parent_paths + # NOTE: this study is configured to have multiple names in the dbgap config + assert resource_to_parent_paths["phs000178"] == [ + "/orgA/programs/", + "/orgB/programs/", + "/programs/", + ] + + if parse_consent_code_config: + assert "phs000178.c1" in resource_to_parent_paths + assert resource_to_parent_paths["phs000178.c1"] == ["/orgA/programs/"] + + assert "phs000178.c2" in resource_to_parent_paths + assert resource_to_parent_paths["phs000178.c2"] == ["/orgA/programs/"] + + assert "phs000179.c1" in resource_to_parent_paths + assert resource_to_parent_paths["phs000179.c1"] == ["/orgA/programs/"] + else: + assert "phs000178" in resource_to_parent_paths + # NOTE: this study is configured to have multiple names in the dbgap config + assert resource_to_parent_paths["phs000178"] == [ + "/orgA/programs/", + "/orgB/programs/", + "/programs/", + ] + + assert "phs000179" in resource_to_parent_paths + assert resource_to_parent_paths["phs000179"] == ["/orgA/programs/"] + + @pytest.mark.parametrize("syncer", ["google", "cleversafe"], indirect=True) def test_sync_from_files(syncer, db_session, storage_client): sess = db_session diff --git a/tests/test-fence-config.yaml b/tests/test-fence-config.yaml index 970f1470e..abcfa8a63 100644 --- a/tests/test-fence-config.yaml +++ b/tests/test-fence-config.yaml @@ -276,12 +276,24 @@ dbGaP: proxy_user: '' protocol: 'sftp' decrypt_key: '' + # parse out the consent from the dbgap accession number such that something + # like "phs000123.v1.p1.c2" becomes "phs000123.c2" + # + # NOTE: when this is "false" the above would become "phs000123" parse_consent_code: true + # A consent of "c999" indicates access to that study's "exchange area data" + # and when a user has access to one study's exchange area data, they + # have access to "common exchange area data" that is not study specific. + # The below configurations are for the Fence project name a user gets access + # to and the Arborist resource they get access to. + parse_exchange_area_code: false + common_exchange_area_project: 'common_exchange' # A mapping from the dbgap study to which authorization namespaces the actual data # lives in. For example, `studyX` data may exist in multiple organizations, so # we need to know to map authorization to all orgs resources study_to_resource_namespaces: '_default': ['/orgA/'] + 'common_exchange': ['/dbgap/'] 'phs000178': ['/orgA/', '/orgB/', '/'] # ////////////////////////////////////////////////////////////////////////////////////// From 66c2769117a65f0cecfe6d245e1fd1d04f2e1754 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Fri, 4 Oct 2019 14:11:48 -0500 Subject: [PATCH 02/15] feat(dbgap-sync): add pi tag if dbgap telemetry file has that info --- fence/sync/sync_users.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index cd5584347..c07540255 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -488,11 +488,21 @@ def _parse_csv(self, file_dict, sess, encrypted=True): dbgap_project += "." + consent_code display_name = row.get("user name", "") + tags = {"dbgap_role": row.get("role", "")} + + # some dbgap telemetry files have information about a researchers PI + if "downloader for" in row: + tags["pi"] = row["downloader for"] + + # prefer name over previous "downloader for" if it exists + if "downloader for names" in row: + tags["pi"] = row["downloader for names"] + user_info[username] = { "email": row.get("email", ""), "display_name": display_name, "phone_number": row.get("phone", ""), - "tags": {"dbgap_role": row.get("role", "")}, + "tags": tags, } if dbgap_project not in self.project_mapping: From 1f6271bce271e1857adcbf1ca2e275336dca27b0 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Fri, 4 Oct 2019 14:19:24 -0500 Subject: [PATCH 03/15] fix(tests): update tests based on newly added tag-parsing for dbgap sync --- tests/dbgap_sync/test_user_sync.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dbgap_sync/test_user_sync.py b/tests/dbgap_sync/test_user_sync.py index 3660c5116..45d436e6b 100644 --- a/tests/dbgap_sync/test_user_sync.py +++ b/tests/dbgap_sync/test_user_sync.py @@ -48,7 +48,7 @@ def test_sync(syncer, db_session, storage_client): assert len(users) == 11 tags = db_session.query(models.Tag).all() - assert len(tags) == 7 + assert len(tags) == 11 proj = db_session.query(models.Project).all() assert len(proj) == 9 From ccc52c70a5599354906af3b242342738a02a2c61 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Fri, 4 Oct 2019 15:38:10 -0500 Subject: [PATCH 04/15] fix(dbgap-sync): remove line added for debugging --- fence/sync/sync_users.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index c07540255..01a63cfed 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -1020,7 +1020,6 @@ def _sync(self, sess): self.sync_two_user_info_dict(user_info_csv, user_info) # privilleges in yaml files overide ones in csv files - self.logger.error(user_yaml.projects) self.sync_two_phsids_dict(user_yaml.projects, user_projects) self.sync_two_user_info_dict(user_yaml.user_info, user_info) From 23cf91c26e307db64f51feb432ba91ce22884bf7 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 7 Oct 2019 10:58:48 -0500 Subject: [PATCH 05/15] Update config-default.yaml --- fence/config-default.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fence/config-default.yaml b/fence/config-default.yaml index a2cc583f7..76da59681 100644 --- a/fence/config-default.yaml +++ b/fence/config-default.yaml @@ -371,7 +371,7 @@ dbGaP: protocol: 'sftp' decrypt_key: '' # parse out the consent from the dbgap accession number such that something - # like "phs000123.v1.p1.c2" becomes "phs000123.c2" + # like "phs000123.v1.p1.c2" becomes "phs000123.c2". # # NOTE: when this is "false" the above would become "phs000123" parse_consent_code: true @@ -668,4 +668,4 @@ DREAM_CHALLENGE_TEAM: 'DREAM' DREAM_CHALLENGE_GROUP: 'DREAM' SYNAPSE_URI: 'https://repo-prod.prod.sagebase.org/auth/v1/' SYNAPSE_DISCOVERY_URL: -SYNAPSE_AUTHZ_TTL: 86400 \ No newline at end of file +SYNAPSE_AUTHZ_TTL: 86400 From 3efd85843fe0f89c47ed63a3e2df02b97b95b697 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Tue, 8 Oct 2019 09:41:32 -0500 Subject: [PATCH 06/15] chore(dbgap-sync): remove line added for debugging --- fence/sync/sync_users.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 01a63cfed..0f40f5aaf 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -1303,7 +1303,6 @@ def _add_dbgap_study_to_arborist(self, dbgap_study): .get("study_to_resource_namespaces", {}) .get("_default", ["/"]) ) - self.logger.debug(config["dbGaP"]["study_to_resource_namespaces"]) namespaces = ( config["dbGaP"] .get("study_to_resource_namespaces", {}) From 6791ddf5c65e23d8f4bbcaa7b732f86e13c8dcd8 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Wed, 16 Oct 2019 13:17:36 -0500 Subject: [PATCH 07/15] feat(usersync): adjust c999 handling to account for fact that their is no overarching common exchange area, just parent study common exchange areas --- fence/config-default.yaml | 52 +++++++++++++++++++++++++++++++++------ fence/sync/sync_users.py | 14 ++++++----- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/fence/config-default.yaml b/fence/config-default.yaml index 76da59681..d98ef9a70 100644 --- a/fence/config-default.yaml +++ b/fence/config-default.yaml @@ -375,19 +375,55 @@ dbGaP: # # NOTE: when this is "false" the above would become "phs000123" parse_consent_code: true - # A consent of "c999" indicates access to that study's "exchange area data" + # A consent of "c999" can indicate access to that study's "exchange area data" # and when a user has access to one study's exchange area data, they - # have access to "common exchange area data" that is not study specific. - # The below configurations are for the Fence project name a user gets access - # to and the Arborist resource they get access to. + # have access to the parent study's "common exchange area data" that is not study + # specific. The following config is whether or not to parse/handle "c999" codes + # for exchange area data parse_exchange_area_code: false - common_exchange_area_project: 'common_exchange' - # A mapping from the dbgap study to which authorization namespaces the actual data - # lives in. For example, `studyX` data may exist in multiple organizations, so + # The below configuration is a mapping from studies to their "common exchange area data" + # Fence project name a user gets access to when parsing c999 exchange area codes (and + # subsequently gives access to an arborist resource representing this common area + # as well) + study_common_exchange_areas: + 'phs000920': 'topmed_common_exchange_area' + 'phs000921': 'topmed_common_exchange_area' + 'phs000946': 'topmed_common_exchange_area' + 'phs000951': 'topmed_common_exchange_area' + 'phs000954': 'topmed_common_exchange_area' + 'phs000956': 'topmed_common_exchange_area' + 'phs000964': 'topmed_common_exchange_area' + 'phs000972': 'topmed_common_exchange_area' + 'phs000974': 'topmed_common_exchange_area' + 'phs000988': 'topmed_common_exchange_area' + 'phs000993': 'topmed_common_exchange_area' + 'phs000997': 'topmed_common_exchange_area' + 'phs001024': 'topmed_common_exchange_area' + 'phs001032': 'topmed_common_exchange_area' + 'phs001040': 'topmed_common_exchange_area' + 'phs001062': 'topmed_common_exchange_area' + 'phs001143': 'topmed_common_exchange_area' + 'phs001189': 'topmed_common_exchange_area' + 'phs001207': 'topmed_common_exchange_area' + 'phs001211': 'topmed_common_exchange_area' + 'phs001215': 'topmed_common_exchange_area' + 'phs001217': 'topmed_common_exchange_area' + 'phs001218': 'topmed_common_exchange_area' + 'phs001237': 'topmed_common_exchange_area' + 'phs001293': 'topmed_common_exchange_area' + 'phs001345': 'topmed_common_exchange_area' + 'phs001359': 'topmed_common_exchange_area' + 'phs001368': 'topmed_common_exchange_area' + 'phs001387': 'topmed_common_exchange_area' + 'phs001402': 'topmed_common_exchange_area' + 'phs001412': 'topmed_common_exchange_area' + 'phs001416': 'topmed_common_exchange_area' + # A mapping from the dbgap study / Fence project to which authorization namespaces the + # actual data lives in. For example, `studyX` data may exist in multiple organizations, so # we need to know to map authorization to all orgs resources study_to_resource_namespaces: '_default': ['/'] - 'common_exchange': ['/dbgap/'] + 'topmed_common_exchange_area': ['/dbgap/'] # above are for default support and exchange area support # below are further examples you can safely remove 'studyX': ['/orgA/', '/orgB/'] diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 0f40f5aaf..62b3a0936 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -300,9 +300,7 @@ def __init__( self.dbgap_key = dbGaP["decrypt_key"] self.parse_consent_code = dbGaP.get("parse_consent_code", True) self.parse_exchange_area_code = dbGaP.get("parse_exchange_area_code", False) - self.common_exchange_area_project = dbGaP.get( - "common_exchange_area_project", "common_exchange" - ) + self.study_common_exchange_areas = dbGaP.get("study_common_exchange_areas", {}) self.session = db_session self.driver = SQLAlchemyDriver(DB) self.project_mapping = project_mapping or {} @@ -467,15 +465,19 @@ def _parse_csv(self, file_dict, sess, encrypted=True): f"got consent code {consent_code} from dbGaP project " "{dbgap_project}" ) - if consent_code == "c999" and self.parse_exchange_area_code: + if ( + consent_code == "c999" + and self.parse_exchange_area_code + and dbgap_project in self.study_common_exchange_areas + ): self.logger.info( "found study with consent c999 and Fence " "is configured to parse exchange area data. Giving user " f"{username} {privileges} privileges in project: " - f"{self.common_exchange_area_project}." + f"{self.study_common_exchange_areas[dbgap_project]}." ) self._add_dbgap_project_for_user( - self.common_exchange_area_project, + self.study_common_exchange_areas[dbgap_project], privileges, username, sess, From 5fd1f2fc855afc455b693199e92a4a09b1cce560 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Thu, 17 Oct 2019 11:49:11 -0500 Subject: [PATCH 08/15] fix(usersync): fix logging to include dbgap project information --- fence/sync/sync_users.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 62b3a0936..6072ca023 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -463,7 +463,7 @@ def _parse_csv(self, file_dict, sess, encrypted=True): # to the common exchange area self.logger.debug( f"got consent code {consent_code} from dbGaP project " - "{dbgap_project}" + f"{dbgap_project}" ) if ( consent_code == "c999" From 4d983987a6bc2a1b897d78460d0e956539e85f7d Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Fri, 18 Oct 2019 16:11:08 -0500 Subject: [PATCH 09/15] feat(usersync): handle c999 as providing access to all consents, deprecate existing handling giving access to phsid --- fence/config-default.yaml | 59 ++++++--------- fence/sync/sync_users.py | 73 ++++++++++++++++-- tests/dbgap_sync/conftest.py | 9 ++- tests/dbgap_sync/test_user_sync.py | 116 ++++++++++++++++++----------- tests/test-fence-config.yaml | 26 ++++--- 5 files changed, 185 insertions(+), 98 deletions(-) diff --git a/fence/config-default.yaml b/fence/config-default.yaml index d98ef9a70..817b72ead 100644 --- a/fence/config-default.yaml +++ b/fence/config-default.yaml @@ -383,52 +383,35 @@ dbGaP: parse_exchange_area_code: false # The below configuration is a mapping from studies to their "common exchange area data" # Fence project name a user gets access to when parsing c999 exchange area codes (and - # subsequently gives access to an arborist resource representing this common area + # subsequently gives access to an Arborist resource representing this common area # as well) study_common_exchange_areas: - 'phs000920': 'topmed_common_exchange_area' - 'phs000921': 'topmed_common_exchange_area' - 'phs000946': 'topmed_common_exchange_area' - 'phs000951': 'topmed_common_exchange_area' - 'phs000954': 'topmed_common_exchange_area' - 'phs000956': 'topmed_common_exchange_area' - 'phs000964': 'topmed_common_exchange_area' - 'phs000972': 'topmed_common_exchange_area' - 'phs000974': 'topmed_common_exchange_area' - 'phs000988': 'topmed_common_exchange_area' - 'phs000993': 'topmed_common_exchange_area' - 'phs000997': 'topmed_common_exchange_area' - 'phs001024': 'topmed_common_exchange_area' - 'phs001032': 'topmed_common_exchange_area' - 'phs001040': 'topmed_common_exchange_area' - 'phs001062': 'topmed_common_exchange_area' - 'phs001143': 'topmed_common_exchange_area' - 'phs001189': 'topmed_common_exchange_area' - 'phs001207': 'topmed_common_exchange_area' - 'phs001211': 'topmed_common_exchange_area' - 'phs001215': 'topmed_common_exchange_area' - 'phs001217': 'topmed_common_exchange_area' - 'phs001218': 'topmed_common_exchange_area' - 'phs001237': 'topmed_common_exchange_area' - 'phs001293': 'topmed_common_exchange_area' - 'phs001345': 'topmed_common_exchange_area' - 'phs001359': 'topmed_common_exchange_area' - 'phs001368': 'topmed_common_exchange_area' - 'phs001387': 'topmed_common_exchange_area' - 'phs001402': 'topmed_common_exchange_area' - 'phs001412': 'topmed_common_exchange_area' - 'phs001416': 'topmed_common_exchange_area' + 'example': 'test_common_exchange_area' + # 'studyX': 'test_common_exchange_area' + # 'studyY': 'test_common_exchange_area' + # 'studyZ': 'test_common_exchange_area' # A mapping from the dbgap study / Fence project to which authorization namespaces the # actual data lives in. For example, `studyX` data may exist in multiple organizations, so # we need to know to map authorization to all orgs resources study_to_resource_namespaces: '_default': ['/'] - 'topmed_common_exchange_area': ['/dbgap/'] + 'test_common_exchange_area': ['/dbgap/'] # above are for default support and exchange area support - # below are further examples you can safely remove - 'studyX': ['/orgA/', '/orgB/'] - 'studyY': ['/orgB/', '/orgC/'] - 'studyZ': ['/orgD/'] + # below are further examples + # + # 'studyX': ['/orgA/', '/orgB/'] + # 'studyX.c2': ['/orgB/', '/orgC/'] + # 'studyZ': ['/orgD/'] + +# Regex to match an assession number that has consent information in forms like: +# phs00301123.c999 +# phs000123.v3.p1.c3 +# phs000123.c3 +# phs00301123.v3.p4.c999 +# Will NOT MATCH forms like: phs000123 +# +# WARNING: Do not change this without consulting the code that uses it +DBGAP_ACCESSION_WITH_CONSENT_REGEX: '(?Pphs[0-9]+)(.(?Pv[0-9]+)){0,1}(.(?Pp[0-9]+)){0,1}.(?Pc[0-9]+)' # ////////////////////////////////////////////////////////////////////////////////////// # STORAGE BACKENDS AND CREDENTIALS diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 6072ca023..4c96bcebb 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -6,6 +6,7 @@ import subprocess as sp import tempfile import yaml +import copy from contextlib import contextmanager from csv import DictReader from io import StringIO @@ -458,9 +459,14 @@ def _parse_csv(self, file_dict, sess, encrypted=True): if len(phsid) > 1 and self.parse_consent_code: consent_code = phsid[-1] - # c999 indicates access to a study-specific exchange area + # c999 indicates full access to all consents and access + # to a study-specific exchange area # access to at least one study-specific exchange implies access # to the common exchange area + # + # NOTE: Handling giving access to all consents is done at + # a later time, when we have full information about possible + # consents self.logger.debug( f"got consent code {consent_code} from dbGaP project " f"{dbgap_project}" @@ -484,10 +490,7 @@ def _parse_csv(self, file_dict, sess, encrypted=True): user_projects, ) - # we want to add the dbgap project if it's a normal consent code - # or if we're parsing the exchange area code - if consent_code != "c999" or self.parse_exchange_area_code: - dbgap_project += "." + consent_code + dbgap_project += "." + consent_code display_name = row.get("user name", "") tags = {"dbgap_role": row.get("role", "")} @@ -978,6 +981,10 @@ def _sync(self, sess): self.logger.info("dbgap files: {}".format(dbgap_file_list)) permissions = [{"read-storage"} for _ in dbgap_file_list] + if self.parse_consent_code and self.parse_exchange_area_code: + self.logger.info( + f"using study to common exchange area mapping: {self.study_common_exchange_areas}" + ) user_projects, user_info = self._parse_csv( dict(list(zip(dbgap_file_list, permissions))), encrypted=True, sess=sess ) @@ -1021,10 +1028,15 @@ def _sync(self, sess): self.sync_two_phsids_dict(user_projects_csv, user_projects) self.sync_two_user_info_dict(user_info_csv, user_info) - # privilleges in yaml files overide ones in csv files + # privileges in yaml files overide ones in csv files self.sync_two_phsids_dict(user_yaml.projects, user_projects) self.sync_two_user_info_dict(user_yaml.user_info, user_info) + if self.parse_consent_code and self.parse_exchange_area_code: + self._grant_all_consents_to_c999_users( + user_projects, user_yaml.project_to_resource + ) + if user_projects: self.logger.info("Sync to db and storage backend") self.sync_to_db_and_storage_backend(user_projects, user_info, sess) @@ -1061,6 +1073,55 @@ def _sync(self, sess): ) exit(1) + def _grant_all_consents_to_c999_users( + self, user_projects, user_yaml_project_to_resources + ): + access_number_matcher = re.compile(config["DBGAP_ACCESSION_WITH_CONSENT_REGEX"]) + # combine dbgap/user.yaml projects into one big list (in case not all consents + # are in either) + all_projects = [] + all_projects.extend([project for project, _ in self._projects.items()]) + all_projects.extend( + [project for project, _ in user_yaml_project_to_resources.items()] + ) + + self.logger.debug(f"all projects: {all_projects}") + + # construct a mapping from phsid (without consent) to all accessions with consent + consent_mapping = {} + for project in all_projects: + phs_match = access_number_matcher.match(project) + if phs_match: + accession_number = phs_match.groupdict() + + # TODO: This is not handling the .v1.p1 at all + consent_mapping.setdefault(accession_number["phsid"], []).append( + ".".join([accession_number["phsid"], accession_number["consent"]]) + ) + + self.logger.debug(f"consent mapping: {consent_mapping}") + + # go through existing access and find any c999's and make sure to give access to + # all accessions with consent for that phsid + for username, user_project_info in copy.deepcopy(user_projects).items(): + for project, permissions in copy.deepcopy(user_project_info).items(): + phs_match = access_number_matcher.match(project) + if phs_match and phs_match.groupdict()["consent"] == "c999": + # give access to all consents + all_phsids_with_consent = consent_mapping.get( + phs_match.groupdict()["phsid"], [] + ) + self.logger.info( + f"user {username} has c999 consent group for: {project}. " + f"Granting access to all consents: {all_phsids_with_consent}" + ) + # NOTE: Only giving read-storage at the moment (this is same + # permission we give for other dbgap projects) + for phsid_with_consent in all_phsids_with_consent: + user_projects[username].update( + {phsid_with_consent: {"read-storage"}} + ) + def _update_arborist(self, session, user_yaml): """ Create roles, resources, policies, groups in arborist from the information in diff --git a/tests/dbgap_sync/conftest.py b/tests/dbgap_sync/conftest.py index 3f3a26e81..2fe829865 100644 --- a/tests/dbgap_sync/conftest.py +++ b/tests/dbgap_sync/conftest.py @@ -120,7 +120,14 @@ def syncer(db_session, request): "phstest": [{"name": "Test", "auth_id": "Test"}], } - dbGap = {} + dbGap = yaml_load( + open( + os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), + "test-fence-config.yaml", + ) + ) + ).get("dbGaP") test_db = yaml_load( open( os.path.join( diff --git a/tests/dbgap_sync/test_user_sync.py b/tests/dbgap_sync/test_user_sync.py index 45d436e6b..94cf5d7e6 100644 --- a/tests/dbgap_sync/test_user_sync.py +++ b/tests/dbgap_sync/test_user_sync.py @@ -40,37 +40,58 @@ def test_sync_incorrect_user_yaml_file(syncer, monkeypatch, db_session): @pytest.mark.parametrize("syncer", ["google", "cleversafe"], indirect=True) -def test_sync(syncer, db_session, storage_client): +@pytest.mark.parametrize("parse_consent_code_config", [False, True]) +def test_sync( + syncer, db_session, storage_client, parse_consent_code_config, monkeypatch +): + # patch the sync to use the parameterized config value + monkeypatch.setattr(syncer, "parse_consent_code", parse_consent_code_config) syncer.sync() users = db_session.query(models.User).all() assert len(users) == 11 - tags = db_session.query(models.Tag).all() - assert len(tags) == 11 + if parse_consent_code_config: + user = models.query_for_user(session=db_session, username="USERC") + assert user.project_access == { + "phs000178.c1": ["read-storage"], + "phs000178.c2": ["read-storage"], + "phs000178.c999": ["read-storage"], + "phs000179.c1": ["read-storage"], + } - proj = db_session.query(models.Project).all() - assert len(proj) == 9 + user = models.query_for_user(session=db_session, username="USERF") + assert user.project_access == { + "phs000178.c1": ["read-storage"], + "phs000178.c2": ["read-storage"], + } - user = models.query_for_user(session=db_session, username="USERC") - assert user.project_access == { - "phs000178": ["read-storage"], - "TCGA-PCAWG": ["read-storage"], - "phs000179.c1": ["read-storage"], - } + user = models.query_for_user(session=db_session, username="TESTUSERB") + assert user.project_access == { + "phs000179.c1": ["read-storage"], + "phs000178.c1": ["read-storage"], + } + else: + user = models.query_for_user(session=db_session, username="USERC") + assert user.project_access == { + "phs000178": ["read-storage"], + "TCGA-PCAWG": ["read-storage"], + "phs000179": ["read-storage"], + } - user = models.query_for_user(session=db_session, username="USERF") - assert user.project_access == { - "phs000178.c1": ["read-storage"], - "phs000178.c2": ["read-storage"], - } + user = models.query_for_user(session=db_session, username="USERF") + assert user.project_access == { + "phs000178": ["read-storage"], + "TCGA-PCAWG": ["read-storage"], + } - user = models.query_for_user(session=db_session, username="TESTUSERB") - assert user.project_access == { - "phs000179.c1": ["read-storage"], - "phs000178.c1": ["read-storage"], - } + user = models.query_for_user(session=db_session, username="TESTUSERB") + assert user.project_access == { + "phs000178": ["read-storage"], + "TCGA-PCAWG": ["read-storage"], + "phs000179": ["read-storage"], + } user = models.query_for_user(session=db_session, username="TESTUSERD") assert user.display_name == "USER D" @@ -119,19 +140,20 @@ def test_dbgap_consent_codes( if parse_exchange_area_config: assert user.project_access == { "phs000179.c1": ["read-storage"], + "phs000178.c1": ["read-storage"], + "phs000178.c2": ["read-storage"], "phs000178.c999": ["read-storage"], # should additionally include the study-specific exchange area access and # access to the common exchange area - config["dbGaP"].get("common_exchange_area_project", ""): [ - "read-storage" - ], + "test_common_exchange_area": ["read-storage"], } else: assert user.project_access == { "phs000179.c1": ["read-storage"], - # behavior for c999 when exchange area parsing is NOT enabled is to - # simply provide access to the study itself without a consent group - "phs000178": ["read-storage"], + # c999 gives access to all consents + "phs000178.c1": ["read-storage"], + "phs000178.c2": ["read-storage"], + "phs000178.c999": ["read-storage"], } else: assert user.project_access == { @@ -178,19 +200,17 @@ def test_dbgap_consent_codes( assert "phs000178.c999" in resource_to_parent_paths assert resource_to_parent_paths["phs000178.c999"] == ["/orgA/programs/"] - assert ( - config["dbGaP"].get("common_exchange_area_project", "") - in resource_to_parent_paths - ) - assert resource_to_parent_paths[ - config["dbGaP"].get("common_exchange_area_project", "") - ] == ["/dbgap/programs/"] + assert "test_common_exchange_area" in resource_to_parent_paths + assert resource_to_parent_paths["test_common_exchange_area"] == [ + "/dbgap/programs/" + ] else: - # behavior for c999 when exchange area parsing is NOT enabled is to - # simply provide access to the study itself without a consent group - assert "phs000178" in resource_to_parent_paths - # NOTE: this study is configured to have multiple names in the dbgap config - assert resource_to_parent_paths["phs000178"] == [ + # c999 gives access to all consents + assert "phs000178.c1" in resource_to_parent_paths + assert "phs000178.c2" in resource_to_parent_paths + assert "phs000178.c999" in resource_to_parent_paths + # NOTE: this study+consent is configured to have multiple names in the dbgap config + assert resource_to_parent_paths["phs000178.c2"] == [ "/orgA/programs/", "/orgB/programs/", "/programs/", @@ -200,8 +220,16 @@ def test_dbgap_consent_codes( assert "phs000178.c1" in resource_to_parent_paths assert resource_to_parent_paths["phs000178.c1"] == ["/orgA/programs/"] + # NOTE: this study+consent is configured to have multiple names in the dbgap config assert "phs000178.c2" in resource_to_parent_paths - assert resource_to_parent_paths["phs000178.c2"] == ["/orgA/programs/"] + assert resource_to_parent_paths["phs000178.c2"] == [ + "/orgA/programs/", + "/orgB/programs/", + "/programs/", + ] + + assert "phs000178.c999" in resource_to_parent_paths + assert resource_to_parent_paths["phs000178.c999"] == ["/orgA/programs/"] assert "phs000179.c1" in resource_to_parent_paths assert resource_to_parent_paths["phs000179.c1"] == ["/orgA/programs/"] @@ -424,15 +452,15 @@ def test_update_arborist(syncer, db_session): # one project is configured to point to two different arborist resource # parent paths (/orgA/ and /orgB/ and /) - project_with_mult_namespaces = "phs000178" + projects_with_mult_namespaces = ["phs000178.c2"] expect_resources = [ "phs000179.c1", "phs000178.c1", "phs000178.c2", - "TCGA-PCAWG", + "phs000178.c999", "data_file", # comes from user.yaml file - project_with_mult_namespaces, ] + expect_resources.extend(projects_with_mult_namespaces) resource_to_parent_paths = {} for call in syncer.arborist_client.update_resource.call_args_list: @@ -445,7 +473,7 @@ def test_update_arborist(syncer, db_session): assert resource in list(resource_to_parent_paths.keys()) if resource == "data_file": assert resource_to_parent_paths[resource] == ["/"] - elif resource == project_with_mult_namespaces: + elif resource in projects_with_mult_namespaces: assert resource_to_parent_paths[resource] == [ "/orgA/programs/", "/orgB/programs/", diff --git a/tests/test-fence-config.yaml b/tests/test-fence-config.yaml index abcfa8a63..8e8fd748e 100644 --- a/tests/test-fence-config.yaml +++ b/tests/test-fence-config.yaml @@ -277,24 +277,32 @@ dbGaP: protocol: 'sftp' decrypt_key: '' # parse out the consent from the dbgap accession number such that something - # like "phs000123.v1.p1.c2" becomes "phs000123.c2" + # like "phs000123.v1.p1.c2" becomes "phs000123.c2". # # NOTE: when this is "false" the above would become "phs000123" parse_consent_code: true - # A consent of "c999" indicates access to that study's "exchange area data" + # A consent of "c999" can indicate access to that study's "exchange area data" # and when a user has access to one study's exchange area data, they - # have access to "common exchange area data" that is not study specific. - # The below configurations are for the Fence project name a user gets access - # to and the Arborist resource they get access to. + # have access to the parent study's "common exchange area data" that is not study + # specific. The following config is whether or not to parse/handle "c999" codes + # for exchange area data parse_exchange_area_code: false - common_exchange_area_project: 'common_exchange' - # A mapping from the dbgap study to which authorization namespaces the actual data - # lives in. For example, `studyX` data may exist in multiple organizations, so + # The below configuration is a mapping from studies to their "common exchange area data" + # Fence project name a user gets access to when parsing c999 exchange area codes (and + # subsequently gives access to an arborist resource representing this common area + # as well) + study_common_exchange_areas: + 'phs000178': 'test_common_exchange_area' + # A mapping from the dbgap study / Fence project to which authorization namespaces the + # actual data lives in. For example, `studyX` data may exist in multiple organizations, so # we need to know to map authorization to all orgs resources study_to_resource_namespaces: '_default': ['/orgA/'] - 'common_exchange': ['/dbgap/'] + 'test_common_exchange_area': ['/dbgap/'] + # study when not parsing consent codes 'phs000178': ['/orgA/', '/orgB/', '/'] + # study when parsing consent codes + 'phs000178.c2': ['/orgA/', '/orgB/', '/'] # ////////////////////////////////////////////////////////////////////////////////////// # STORAGE BACKENDS AND CREDENTIALS From 50ab077f10959a70447844afc344692ce0a74b9a Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Fri, 18 Oct 2019 16:34:54 -0500 Subject: [PATCH 10/15] feat(usersync): cleanup tests, fix logic for c999 handling --- fence/sync/sync_users.py | 2 +- tests/dbgap_sync/test_user_sync.py | 1 - tests/test-fence-config.yaml | 10 ++++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 1bc368706..2caa20480 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -1034,7 +1034,7 @@ def _sync(self, sess): self.sync_two_phsids_dict(user_yaml.projects, user_projects) self.sync_two_user_info_dict(user_yaml.user_info, user_info) - if self.parse_consent_code and self.parse_exchange_area_code: + if self.parse_consent_code: self._grant_all_consents_to_c999_users( user_projects, user_yaml.project_to_resource ) diff --git a/tests/dbgap_sync/test_user_sync.py b/tests/dbgap_sync/test_user_sync.py index 94cf5d7e6..de8022932 100644 --- a/tests/dbgap_sync/test_user_sync.py +++ b/tests/dbgap_sync/test_user_sync.py @@ -460,7 +460,6 @@ def test_update_arborist(syncer, db_session): "phs000178.c999", "data_file", # comes from user.yaml file ] - expect_resources.extend(projects_with_mult_namespaces) resource_to_parent_paths = {} for call in syncer.arborist_client.update_resource.call_args_list: diff --git a/tests/test-fence-config.yaml b/tests/test-fence-config.yaml index bd5a0aa5a..8e96a6748 100644 --- a/tests/test-fence-config.yaml +++ b/tests/test-fence-config.yaml @@ -311,6 +311,16 @@ dbGaP: # study when parsing consent codes 'phs000178.c2': ['/orgA/', '/orgB/', '/'] +# Regex to match an assession number that has consent information in forms like: +# phs00301123.c999 +# phs000123.v3.p1.c3 +# phs000123.c3 +# phs00301123.v3.p4.c999 +# Will NOT MATCH forms like: phs000123 +# +# WARNING: Do not change this without consulting the code that uses it +DBGAP_ACCESSION_WITH_CONSENT_REGEX: '(?Pphs[0-9]+)(.(?Pv[0-9]+)){0,1}(.(?Pp[0-9]+)){0,1}.(?Pc[0-9]+)' + # ////////////////////////////////////////////////////////////////////////////////////// # STORAGE BACKENDS AND CREDENTIALS # - Optional: Used for `/admin` & `/credentials` endpoints for user management. From e238f0778694afcebcd03980025ba53757e278ce Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 21 Oct 2019 13:18:53 -0500 Subject: [PATCH 11/15] docs(dbgap): add some docs about our understanding of dbgap --- docs/dbgap_info.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 docs/dbgap_info.md diff --git a/docs/dbgap_info.md b/docs/dbgap_info.md new file mode 100644 index 000000000..663b6f678 --- /dev/null +++ b/docs/dbgap_info.md @@ -0,0 +1,46 @@ +# dbGaP Information (as understood by Gen3) + +The [Database for Genotypes and Phenotypes (dbGaP)](https://www.ncbi.nlm.nih.gov/gap/) is used to "archive and distribute the data and results from studies that have investigated the interaction of genotype and phenotype in Humans". + +> NOTE: For official details about dbGaP please visit the official site/documentation. + +The largest unit of data that can be submitted to dbGaP is a *Study*. Studies can have sub-studies. Each study is identified by a unique study number (AKA phsid AKA study accession) and additional information (like version), which may look something like `phs001826.v1.p1.c1`. The `.` delimites various pieces of information. + +* `phs001826`: unique study identifier +* `v1`: version number +* `p1`: participant set number +* `c1`: consent group + +The combination of these fields is known as a *dbGaP Accession Number*. + +More information about this can be found in [this NCBI article](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2031016/). + +## Authorization + +Fence is capable of syncing user access information from dbGaP's *Telemetry Files* (AKA study whitelists). These are typically provided in an SFTP server as CSV's or TXT's. You can see an example of the format in the Fence unit tests (`/tests/dbgap_sync`). + +A single *Telemetry File* represents the access allowed for a given dbGaP study accession. The file will contain rows of eRA Commons user IDs and other information. Fence is able to parse all *Telemetry Files* in an SFTP server (given credentials for the SFTP server). + +## Consent Groups + +Fence contains a configuration for whether or not to parse consent codes. When parsing consent codes, the authorization resources a user is given access to will be in the form `study_id.consent_group` (ex: `phs001826.c2`). + +### Consent Group `c999` Handling + +The consent group `c999` is interpretted as meaning the user should implicitly have access +to **all** available consent groups within the study. It can additionally be interpretted +as providing access to that study's "exchange area" (in addition to the parent study's +"common exchange area"). + +Fence will consolidate all known consents for a given study and then provide any user +with `c999` access to all those consents (including `.c999` explicitly, to represent +that study's exchange area). + +Fence allows configuring whether or not you want to handle the "common exchange area" logic +mentioned above. When turned on, you can provide a list of study identifiers (ex: `phs000123`, `phs000456`) and the resource you want to represent their parent study's common exchange area (ex: `123_and_456_common_exchange_area`) in Fence's configuration file. + +> See the `config-default.yaml` for more information about available configurations. + +## Version Updates + +A study can be updated and at that time the patients and consent groups may change and the version number `v1` would get bumped up. At the moment, Fence does not handle these versions, so authorization if effectively either study level, or study+consent level. \ No newline at end of file From 2c58dabe548b1c3c075663140168d5ce0f0a49d4 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 21 Oct 2019 13:21:13 -0500 Subject: [PATCH 12/15] docs(dbgap): use dbgap official terminology --- docs/dbgap_info.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/dbgap_info.md b/docs/dbgap_info.md index 663b6f678..56bb86e72 100644 --- a/docs/dbgap_info.md +++ b/docs/dbgap_info.md @@ -7,9 +7,9 @@ The [Database for Genotypes and Phenotypes (dbGaP)](https://www.ncbi.nlm.nih.gov The largest unit of data that can be submitted to dbGaP is a *Study*. Studies can have sub-studies. Each study is identified by a unique study number (AKA phsid AKA study accession) and additional information (like version), which may look something like `phs001826.v1.p1.c1`. The `.` delimites various pieces of information. * `phs001826`: unique study identifier -* `v1`: version number -* `p1`: participant set number -* `c1`: consent group +* `v1`: data version +* `p1`: participant set version +* `c1`: consent group version The combination of these fields is known as a *dbGaP Accession Number*. From d9b193fc56f5b7f746b6405f4f70c664e3551717 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 21 Oct 2019 16:04:39 -0500 Subject: [PATCH 13/15] chore(refactor): improve dbgap docs, clarify configuration by updating name --- docs/dbgap_info.md | 31 ++++++++++++++++++++++++++---- fence/config-default.yaml | 7 +++++-- fence/sync/sync_users.py | 8 +++++--- tests/dbgap_sync/test_user_sync.py | 4 +++- tests/test-fence-config.yaml | 2 +- 5 files changed, 41 insertions(+), 11 deletions(-) diff --git a/docs/dbgap_info.md b/docs/dbgap_info.md index 56bb86e72..8cc92e763 100644 --- a/docs/dbgap_info.md +++ b/docs/dbgap_info.md @@ -23,7 +23,11 @@ A single *Telemetry File* represents the access allowed for a given dbGaP study ## Consent Groups -Fence contains a configuration for whether or not to parse consent codes. When parsing consent codes, the authorization resources a user is given access to will be in the form `study_id.consent_group` (ex: `phs001826.c2`). +Fence contains a configuration for whether or not to parse consent codes (at the time of writing, it is `parse_consent_code` in the `dbGaP` block). + +> NOTE: Reference the `config-default.yaml` for current configuration options and further details. + +When parsing consent codes, the authorization resources a user is given access to will be in the form `study_id.consent_group` (ex: `phs001826.c2`). ### Consent Group `c999` Handling @@ -37,10 +41,29 @@ with `c999` access to all those consents (including `.c999` explicitly, to repre that study's exchange area). Fence allows configuring whether or not you want to handle the "common exchange area" logic -mentioned above. When turned on, you can provide a list of study identifiers (ex: `phs000123`, `phs000456`) and the resource you want to represent their parent study's common exchange area (ex: `123_and_456_common_exchange_area`) in Fence's configuration file. +mentioned above (at the time of writing, it is `enable_common_exchange_area_access` in the `dbGaP` block). + +When turned on, you can provide a list of study identifiers (ex: `phs000123`, `phs000456`) and the resource you want to represent their parent study's common exchange area (ex: `123_and_456_common_exchange_area`) in Fence's configuration file (at the time of writing, it is `study_common_exchange_areas` in the `dbGaP` block). + +> NOTE: Again, please see the `config-default.yaml` for more information about available configurations. + +For example, `c999` would be handled slightly differently based on configuration. Below, assume a user has access to `c999` consent group: + +|| **Consent Cfg == True** | **Consent Codes Cfg == False** | +|---| ------------- | ------------- | +| **Common Exchange Cfg == True** | access to: common exchange area (if phsid in cfg mapping) + study-specifc exchange area + all consent codes | c999 ignored, access to phsid w/o consent | +| **Common Exchange Cfg == False** | access to: study-specifc exchange area + all consent codes | c999 ignored, access to phsid w/o consent | + +So the user access granted in a situation with `phs000123.c999` (assuming there exists a +`phs000123.c1` and `phs000123.c2`): + +|| **Consent Cfg == True** | **Consent Codes Cfg == False** | +|---| ------------- | ------------- | +| **Common Exchange Cfg == True** | `test_common_exchange_area` + `phs000123.c999` + `phs000123.c1`, `phs000123.c2` | `phs000123` +| **Common Exchange Cfg == False** | `phs000123.c999` + `phs000123.c1`, `phs000123.c2` | `phs000123` | -> See the `config-default.yaml` for more information about available configurations. +> NOTE: On the resource level, `phs000123.c999` should refer to resources that exist in that study's specific exchange area. Resources in the parent's common exchange area should be controlled via `test_common_exchange_area`. ## Version Updates -A study can be updated and at that time the patients and consent groups may change and the version number `v1` would get bumped up. At the moment, Fence does not handle these versions, so authorization if effectively either study level, or study+consent level. \ No newline at end of file +A study can be updated and at that time the patients and consent groups may change and the version number `v1` would get bumped up. At the moment, Fence does not handle these versions, so authorization is effectively either study level, or study+consent level. \ No newline at end of file diff --git a/fence/config-default.yaml b/fence/config-default.yaml index 402587aa6..9e28eb3ff 100644 --- a/fence/config-default.yaml +++ b/fence/config-default.yaml @@ -392,8 +392,11 @@ dbGaP: # and when a user has access to one study's exchange area data, they # have access to the parent study's "common exchange area data" that is not study # specific. The following config is whether or not to parse/handle "c999" codes - # for exchange area data - parse_exchange_area_code: false + # for access to the common exchange area data + # + # NOTE: When enabled you MUST also provide a mapping to the + # `study_common_exchange_areas` from study -> parent common exchange area resource + enable_common_exchange_area_access: false # The below configuration is a mapping from studies to their "common exchange area data" # Fence project name a user gets access to when parsing c999 exchange area codes (and # subsequently gives access to an Arborist resource representing this common area diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 2caa20480..53bd1acfd 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -300,7 +300,9 @@ def __init__( self.protocol = dbGaP["protocol"] self.dbgap_key = dbGaP["decrypt_key"] self.parse_consent_code = dbGaP.get("parse_consent_code", True) - self.parse_exchange_area_code = dbGaP.get("parse_exchange_area_code", False) + self.enable_common_exchange_area_access = dbGaP.get( + "enable_common_exchange_area_access", False + ) self.study_common_exchange_areas = dbGaP.get("study_common_exchange_areas", {}) self.session = db_session self.driver = SQLAlchemyDriver(DB) @@ -475,7 +477,7 @@ def _parse_csv(self, file_dict, sess, encrypted=True): ) if ( consent_code == "c999" - and self.parse_exchange_area_code + and self.enable_common_exchange_area_access and dbgap_project in self.study_common_exchange_areas ): self.logger.info( @@ -983,7 +985,7 @@ def _sync(self, sess): self.logger.info("dbgap files: {}".format(dbgap_file_list)) permissions = [{"read-storage"} for _ in dbgap_file_list] - if self.parse_consent_code and self.parse_exchange_area_code: + if self.parse_consent_code and self.enable_common_exchange_area_access: self.logger.info( f"using study to common exchange area mapping: {self.study_common_exchange_areas}" ) diff --git a/tests/dbgap_sync/test_user_sync.py b/tests/dbgap_sync/test_user_sync.py index de8022932..06b0d3653 100644 --- a/tests/dbgap_sync/test_user_sync.py +++ b/tests/dbgap_sync/test_user_sync.py @@ -129,7 +129,9 @@ def test_dbgap_consent_codes( ): # patch the sync to use the parameterized value for whether or not to parse exchange # area data - monkeypatch.setattr(syncer, "parse_exchange_area_code", parse_exchange_area_config) + monkeypatch.setattr( + syncer, "enable_common_exchange_area_access", parse_exchange_area_config + ) monkeypatch.setattr(syncer, "parse_consent_code", parse_consent_code_config) monkeypatch.setattr(syncer, "project_mapping", {}) diff --git a/tests/test-fence-config.yaml b/tests/test-fence-config.yaml index 8e96a6748..2b27be184 100644 --- a/tests/test-fence-config.yaml +++ b/tests/test-fence-config.yaml @@ -293,7 +293,7 @@ dbGaP: # have access to the parent study's "common exchange area data" that is not study # specific. The following config is whether or not to parse/handle "c999" codes # for exchange area data - parse_exchange_area_code: false + enable_common_exchange_area_access: false # The below configuration is a mapping from studies to their "common exchange area data" # Fence project name a user gets access to when parsing c999 exchange area codes (and # subsequently gives access to an arborist resource representing this common area From 91f53f0358f6caa0db19223b23180fc06aabeeb3 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 21 Oct 2019 16:19:59 -0500 Subject: [PATCH 14/15] chore(sync/tests): cleanup logic in sync, clarify/consolidate test(s) --- fence/sync/sync_users.py | 4 ++-- tests/dbgap_sync/test_user_sync.py | 29 ++++++++++++----------------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 53bd1acfd..03509cde5 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -1099,7 +1099,7 @@ def _grant_all_consents_to_c999_users( accession_number = phs_match.groupdict() # TODO: This is not handling the .v1.p1 at all - consent_mapping.setdefault(accession_number["phsid"], []).append( + consent_mapping.setdefault(accession_number["phsid"], set()).add( ".".join([accession_number["phsid"], accession_number["consent"]]) ) @@ -1108,7 +1108,7 @@ def _grant_all_consents_to_c999_users( # go through existing access and find any c999's and make sure to give access to # all accessions with consent for that phsid for username, user_project_info in copy.deepcopy(user_projects).items(): - for project, permissions in copy.deepcopy(user_project_info).items(): + for project, _ in user_project_info.items(): phs_match = access_number_matcher.match(project) if phs_match and phs_match.groupdict()["consent"] == "c999": # give access to all consents diff --git a/tests/dbgap_sync/test_user_sync.py b/tests/dbgap_sync/test_user_sync.py index 06b0d3653..fc93ffdc3 100644 --- a/tests/dbgap_sync/test_user_sync.py +++ b/tests/dbgap_sync/test_user_sync.py @@ -117,20 +117,20 @@ def test_sync( @pytest.mark.parametrize("syncer", ["google"], indirect=True) -@pytest.mark.parametrize("parse_exchange_area_config", [False, True]) +@pytest.mark.parametrize("enable_common_exchange_area", [False, True]) @pytest.mark.parametrize("parse_consent_code_config", [False, True]) def test_dbgap_consent_codes( syncer, db_session, storage_client, - parse_exchange_area_config, + enable_common_exchange_area, parse_consent_code_config, monkeypatch, ): # patch the sync to use the parameterized value for whether or not to parse exchange # area data monkeypatch.setattr( - syncer, "enable_common_exchange_area_access", parse_exchange_area_config + syncer, "enable_common_exchange_area_access", enable_common_exchange_area ) monkeypatch.setattr(syncer, "parse_consent_code", parse_consent_code_config) monkeypatch.setattr(syncer, "project_mapping", {}) @@ -139,7 +139,9 @@ def test_dbgap_consent_codes( user = models.query_for_user(session=db_session, username="USERC") if parse_consent_code_config: - if parse_exchange_area_config: + if enable_common_exchange_area: + # b/c user has c999, ensure they have access to all consents, study-specific + # exchange area (via .c999) and the common exchange area configured assert user.project_access == { "phs000179.c1": ["read-storage"], "phs000178.c1": ["read-storage"], @@ -150,6 +152,8 @@ def test_dbgap_consent_codes( "test_common_exchange_area": ["read-storage"], } else: + # b/c user has c999 but common exchange area is disabled, ensure they have + # access to all consents, study-specific exchange area (via .c999) assert user.project_access == { "phs000179.c1": ["read-storage"], # c999 gives access to all consents @@ -158,6 +162,7 @@ def test_dbgap_consent_codes( "phs000178.c999": ["read-storage"], } else: + # with consent code parsing off, ensure users have access to just phsids assert user.project_access == { "phs000178": ["read-storage"], "phs000179": ["read-storage"], @@ -198,7 +203,9 @@ def test_dbgap_consent_codes( resource_to_parent_paths.setdefault(resource, []).append(parent_path) if parse_consent_code_config: - if parse_exchange_area_config: + if enable_common_exchange_area: + # b/c user has c999, ensure they have access to all consents, study-specific + # exchange area (via .c999) and the common exchange area configured assert "phs000178.c999" in resource_to_parent_paths assert resource_to_parent_paths["phs000178.c999"] == ["/orgA/programs/"] @@ -206,19 +213,7 @@ def test_dbgap_consent_codes( assert resource_to_parent_paths["test_common_exchange_area"] == [ "/dbgap/programs/" ] - else: - # c999 gives access to all consents - assert "phs000178.c1" in resource_to_parent_paths - assert "phs000178.c2" in resource_to_parent_paths - assert "phs000178.c999" in resource_to_parent_paths - # NOTE: this study+consent is configured to have multiple names in the dbgap config - assert resource_to_parent_paths["phs000178.c2"] == [ - "/orgA/programs/", - "/orgB/programs/", - "/programs/", - ] - if parse_consent_code_config: assert "phs000178.c1" in resource_to_parent_paths assert resource_to_parent_paths["phs000178.c1"] == ["/orgA/programs/"] From a2b778942b2ba22b5acfa4d0e175ebb70337d044 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 21 Oct 2019 16:42:51 -0500 Subject: [PATCH 15/15] chore(refactor): clarify cfg comments, consolidate logic for getting all projects --- fence/config-default.yaml | 2 +- fence/sync/sync_users.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fence/config-default.yaml b/fence/config-default.yaml index 9e28eb3ff..7e94ea3b5 100644 --- a/fence/config-default.yaml +++ b/fence/config-default.yaml @@ -408,7 +408,7 @@ dbGaP: # 'studyZ': 'test_common_exchange_area' # A mapping from the dbgap study / Fence project to which authorization namespaces the # actual data lives in. For example, `studyX` data may exist in multiple organizations, so - # we need to know to map authorization to all orgs resources + # we need to know how to map authorization to all orgs resources study_to_resource_namespaces: '_default': ['/'] 'test_common_exchange_area': ['/dbgap/'] diff --git a/fence/sync/sync_users.py b/fence/sync/sync_users.py index 03509cde5..07aaf6f00 100644 --- a/fence/sync/sync_users.py +++ b/fence/sync/sync_users.py @@ -465,8 +465,8 @@ def _parse_csv(self, file_dict, sess, encrypted=True): # c999 indicates full access to all consents and access # to a study-specific exchange area - # access to at least one study-specific exchange implies access - # to the common exchange area + # access to at least one study-specific exchange area implies access + # to the parent study's common exchange area # # NOTE: Handling giving access to all consents is done at # a later time, when we have full information about possible @@ -1083,10 +1083,8 @@ def _grant_all_consents_to_c999_users( access_number_matcher = re.compile(config["DBGAP_ACCESSION_WITH_CONSENT_REGEX"]) # combine dbgap/user.yaml projects into one big list (in case not all consents # are in either) - all_projects = [] - all_projects.extend([project for project, _ in self._projects.items()]) - all_projects.extend( - [project for project, _ in user_yaml_project_to_resources.items()] + all_projects = set( + list(self._projects.keys()) + list(user_yaml_project_to_resources.keys()) ) self.logger.debug(f"all projects: {all_projects}")