diff --git a/.github/workflows/build-dev.yaml b/.github/workflows/build-dev.yaml index 689f42f5..4bb3aca3 100644 --- a/.github/workflows/build-dev.yaml +++ b/.github/workflows/build-dev.yaml @@ -85,12 +85,11 @@ jobs: with: context: . tags: ${{ steps.vars.outputs.BE_NAMESPACE }}/fragalysis-backend:${{ env.GITHUB_REF_SLUG }} - - name: Test - run: > - docker-compose -f docker-compose.test.yml up - --build - --exit-code-from tests - --abort-on-container-exit + - name: Test (docker compose) + uses: hoverkraft-tech/compose-action@v2.0.1 + with: + compose-file: ./docker-compose.test.yml + up-flags: --build --exit-code-from tests --abort-on-container-exit env: BE_NAMESPACE: ${{ steps.vars.outputs.BE_NAMESPACE }} BE_IMAGE_TAG: ${{ env.GITHUB_REF_SLUG }} diff --git a/.github/workflows/build-production.yaml b/.github/workflows/build-production.yaml index b49d06df..11cec835 100644 --- a/.github/workflows/build-production.yaml +++ b/.github/workflows/build-production.yaml @@ -134,12 +134,11 @@ jobs: tags: | ${{ steps.vars.outputs.BE_NAMESPACE }}/fragalysis-backend:${{ steps.vars.outputs.tag }} ${{ steps.vars.outputs.BE_NAMESPACE }}/fragalysis-backend:stable - - name: Test - run: > - docker-compose -f docker-compose.test.yml up - --build - --exit-code-from tests - --abort-on-container-exit + - name: Test (docker compose) + uses: hoverkraft-tech/compose-action@v2.0.1 + with: + compose-file: ./docker-compose.test.yml + up-flags: --build --exit-code-from tests --abort-on-container-exit env: BE_NAMESPACE: ${{ steps.vars.outputs.BE_NAMESPACE }} BE_IMAGE_TAG: ${{ steps.vars.outputs.tag }} diff --git a/.github/workflows/build-staging.yaml b/.github/workflows/build-staging.yaml index 35b70ff0..d76fe21a 100644 --- a/.github/workflows/build-staging.yaml +++ b/.github/workflows/build-staging.yaml @@ -154,12 +154,11 @@ jobs: with: context: . 
tags: ${{ steps.vars.outputs.BE_NAMESPACE }}/fragalysis-backend:${{ steps.vars.outputs.tag }} - - name: Test - run: > - docker-compose -f docker-compose.test.yml up - --build - --exit-code-from tests - --abort-on-container-exit + - name: Test (docker compose) + uses: hoverkraft-tech/compose-action@v2.0.1 + with: + compose-file: ./docker-compose.test.yml + up-flags: --build --exit-code-from tests --abort-on-container-exit env: BE_NAMESPACE: ${{ steps.vars.outputs.BE_NAMESPACE }} BE_IMAGE_TAG: ${{ steps.vars.outputs.tag }} diff --git a/viewer/cset_upload.py b/viewer/cset_upload.py index 5db2ef7a..48d375f3 100644 --- a/viewer/cset_upload.py +++ b/viewer/cset_upload.py @@ -290,6 +290,19 @@ def create_mol(self, inchi, target, name=None) -> Compound: current_identifier=name, ) cpd.save() + # This is a new compound. + # We must now set relationships to the Proposal that it applies to. + # We do this by copying the relationships from the Target. + num_target_proposals = len(target.project_id.all()) + assert num_target_proposals > 0 + if num_target_proposals > 1: + logger.warning( + 'Compound Target %s has more than one Proposal (%d)', + target.title, + num_target_proposals, + ) + for project in target.project_set.all(): + cpd.project_id.add(project) except MultipleObjectsReturned as exc: # NB! 
when processing new uploads, Compound is always # fetched by inchi_key, so this shouldn't ever create @@ -339,7 +352,6 @@ def set_mol( smiles = Chem.MolToSmiles(mol) inchi = Chem.inchi.MolToInchi(mol) molecule_name = mol.GetProp('_Name') - version = mol.GetProp('version') compound: Compound = self.create_mol( inchi, compound_set.target, name=molecule_name @@ -449,8 +461,9 @@ def set_mol( existing_computed_molecules.append(k) if len(existing_computed_molecules) == 1: - logger.info( - 'Using existing ComputedMolecule %s', existing_computed_molecules[0] + logger.warning( + 'Using existing ComputedMolecule %s and overwriting its metadata', + existing_computed_molecules[0], ) computed_molecule = existing_computed_molecules[0] elif len(existing_computed_molecules) > 1: @@ -485,7 +498,6 @@ def set_mol( computed_molecule.pdb = lhs_so # TODO: this is wrong computed_molecule.pdb_info = pdb_info - computed_molecule.version = version # Extract possible reference URL and Rationale # URLs have to be valid URLs and rationals must contain more than one word ref_url: Optional[str] = ( diff --git a/viewer/download_structures.py b/viewer/download_structures.py index 0a25850c..d1a3d8ce 100644 --- a/viewer/download_structures.py +++ b/viewer/download_structures.py @@ -104,6 +104,25 @@ def __init__(self, category): # fmt: on +class UploadTagSubquery(Subquery): + """Annotate SiteObservation with tag of given category""" + + def __init__(self, category): + # fmt: off + query = SiteObservationTag.objects.filter( + pk=Subquery( + SiteObvsSiteObservationTag.objects.filter( + site_observation=OuterRef(OuterRef('pk')), + site_obvs_tag__category=TagCategory.objects.get( + category=category, + ), + ).values('site_obvs_tag')[:1] + ) + ).values('upload_name')[0:1] + super().__init__(query) + # fmt: on + + class CuratedTagSubquery(Exists): """Annotate SiteObservation with tag of given category""" @@ -140,6 +159,10 @@ class ArchiveFile: 'ligand_pdb': {}, 'ligand_mol': {}, 'ligand_smiles': {}, + # 
additional ccp4 files, issue 1448 + 'event_file_crystallographic': {}, + 'diff_file_crystallographic': {}, + 'sigmaa_file_crystallographic': {}, }, 'molecules': { 'sdf_files': {}, @@ -423,14 +446,34 @@ def _metadata_file_zip(ziparchive, target, site_observations): logger.info('+ Processing metadata') annotations = {} - values = ['code', 'longcode', 'cmpd__compound_code', 'smiles', 'downloaded'] - header = ['Code', 'Long code', 'Compound code', 'Smiles', 'Downloaded'] + values = [ + 'code', + 'longcode', + 'experiment__code', + 'cmpd__compound_code', + 'smiles', + 'canon_site_conf__canon_site__centroid_res', + 'downloaded', + ] + header = [ + 'Code', + 'Long code', + 'Experiment code', + 'Compound code', + 'Smiles', + 'Centroid res', + 'Downloaded', + ] for category in TagCategory.objects.filter(category__in=TAG_CATEGORIES): tag = f'tag_{category.category.lower()}' + upload_tag = f'upload_tag_{category.category.lower()}' values.append(tag) - header.append(category.category) + header.append(f'{category.category} alias') annotations[tag] = TagSubquery(category.category) + values.append(upload_tag) + header.append(f'{category.category} upload name') + annotations[upload_tag] = UploadTagSubquery(category.category) pattern = re.compile(r'\W+') # non-alphanumeric characters for tag in SiteObservationTag.objects.filter( @@ -812,6 +855,7 @@ def _create_structures_dict(site_obvs, protein_params, other_params): ) ) else: + # file not in upload archive_path = str(apath.joinpath(param)) afile = [ @@ -820,12 +864,38 @@ def _create_structures_dict(site_obvs, protein_params, other_params): archive_path=archive_path, ) ] + else: logger.warning('Unexpected param: %s', param) continue zip_contents['proteins'][param][so.code] = afile + # add additional ccp4 files (issue 1448) + ccps = ('sigmaa_file', 'diff_file', 'event_file') + if param in ccps: + # these only come from siteobservation object + model_attr = getattr(so, param) + if model_attr and model_attr != 'None': + apath = 
Path('aligned_files').joinpath(so.code) + ccp_path = Path(model_attr.name) + path = ccp_path.parent.joinpath( + f'{ccp_path.stem}_crystallographic{ccp_path.suffix}' + ) + archive_path = str( + apath.joinpath(path.parts[-1].replace(so.longcode, so.code)) + ) + + afile = [ + ArchiveFile( + path=str(path), + archive_path=archive_path, + ) + ] + zip_contents['proteins'][f'{param}_crystallographic'][ + so.code + ] = afile + zip_contents['molecules']['single_sdf_file'] = other_params['single_sdf_file'] zip_contents['molecules']['sdf_info'] = other_params['sdf_info'] diff --git a/viewer/migrations/0059_remove_computedmolecule_version.py b/viewer/migrations/0059_remove_computedmolecule_version.py new file mode 100644 index 00000000..c6baece9 --- /dev/null +++ b/viewer/migrations/0059_remove_computedmolecule_version.py @@ -0,0 +1,17 @@ +# Generated by Django 3.2.25 on 2024-07-10 08:31 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('viewer', '0058_auto_20240614_1016'), + ] + + operations = [ + migrations.RemoveField( + model_name='computedmolecule', + name='version', + ), + ] diff --git a/viewer/migrations/0060_canonsite_centroid_res.py b/viewer/migrations/0060_canonsite_centroid_res.py new file mode 100644 index 00000000..97150507 --- /dev/null +++ b/viewer/migrations/0060_canonsite_centroid_res.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.25 on 2024-07-29 12:50 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('viewer', '0059_remove_computedmolecule_version'), + ] + + operations = [ + migrations.AddField( + model_name='canonsite', + name='centroid_res', + field=models.TextField(null=True), + ), + ] diff --git a/viewer/models.py b/viewer/models.py index 85678ad0..606f562d 100644 --- a/viewer/models.py +++ b/viewer/models.py @@ -396,6 +396,7 @@ class CanonSite(Versionable, models.Model): canon_site_num = models.IntegerField( null=True, help_text="numeric 
canon site id (enumerated on creation)" ) + centroid_res = models.TextField(null=True) objects = models.Manager() filter_manager = CanonSiteDataManager() @@ -1027,7 +1028,6 @@ class ComputedMolecule(models.Model): max_length=255, help_text="Link to pdb file; user-uploaded pdb or pdb.experiment.pdb_info", ) - version = models.PositiveSmallIntegerField(null=False, default=1) def __str__(self) -> str: return f"{self.smiles}" diff --git a/viewer/target_loader.py b/viewer/target_loader.py index 8a3cbb15..37734ed4 100644 --- a/viewer/target_loader.py +++ b/viewer/target_loader.py @@ -52,9 +52,6 @@ # assemblies and xtalforms XTALFORMS_FILE = "assemblies.yaml" -# canon site tag names -CANON_SITES_FILE = "canonical_sites.yaml" - # target name, nothing else CONFIG_FILE = "config*.yaml" @@ -1084,6 +1081,7 @@ def process_canon_site( Incoming data format: : + centroid_res: conformer_site_ids: global_reference_dtag: reference_conformer_site_id: @@ -1107,6 +1105,9 @@ def process_canon_site( ) residues = extract(key="residues", return_type=list) + centroid_res = extract(key="centroid_res") + conf_sites_ids = extract(key="conformer_site_ids", return_type=list) + ref_conf_site_id = extract(key="reference_conformer_site_id") fields = { "name": canon_site_id, @@ -1115,11 +1116,9 @@ def process_canon_site( defaults = { "residues": residues, + "centroid_res": centroid_res, } - conf_sites_ids = extract(key="conformer_site_ids", return_type=list) - ref_conf_site_id = extract(key="reference_conformer_site_id") - index_data = { "ref_conf_site": ref_conf_site_id, "conformer_site_ids": conf_sites_ids, @@ -1475,10 +1474,9 @@ def process_bundle(self): config = self._load_yaml(config_file) meta = self._load_yaml(Path(upload_dir).joinpath(METADATA_FILE)) xtalforms_yaml = self._load_yaml(Path(upload_dir).joinpath(XTALFORMS_FILE)) - canon_sites_yaml = self._load_yaml(Path(upload_dir).joinpath(CANON_SITES_FILE)) # this is the last file to load. 
if any of the files missing, don't continue - if not any([meta, config, xtalforms_yaml, canon_sites_yaml]): + if not any([meta, config, xtalforms_yaml]): msg = "Missing files in uploaded data, aborting" raise FileNotFoundError(msg) @@ -1661,24 +1659,6 @@ def process_bundle(self): canon_sites=canon_sites_by_conf_sites, xtalforms=xtalform_objects, ) - # enumerate xtalform_sites. a bit trickier than others because - # requires alphabetic enumeration - last_xtsite = ( - XtalformSite.objects.filter( - pk__in=[ - k.instance.pk - for k in xtalform_sites_objects.values() # pylint: disable=no-member - ] - ) - .order_by("-xtalform_site_num")[0] - .xtalform_site_num - ) - - xtnum = alphanumerator(start_from=last_xtsite) - for val in xtalform_sites_objects.values(): # pylint: disable=no-member - if not val.instance.xtalform_site_num: - val.instance.xtalform_site_num = next(xtnum) - val.instance.save() # now can update CanonSite with ref_conf_site # also, fill the canon_site_num field @@ -1811,16 +1791,12 @@ def process_bundle(self): logger.debug("data read and processed, adding tags") - canon_name_tag_map = { - k: v["centroid_res"] if "centroid_res" in v.keys() else "UNDEFINED" - for k, v in canon_sites_yaml.items() - } - # tag site observations cat_canon = TagCategory.objects.get(category="CanonSites") for val in canon_site_objects.values(): # pylint: disable=no-member prefix = val.instance.canon_site_num - tag = canon_name_tag_map.get(val.versioned_key, "UNDEFINED") + # tag = canon_name_tag_map.get(val.versioned_key, "UNDEFINED") + tag = val.versioned_key so_list = SiteObservation.objects.filter( canon_site_conf__canon_site=val.instance ) @@ -1839,12 +1815,10 @@ def process_bundle(self): f"{val.instance.canon_site.canon_site_num}" + f"{next(numerators[val.instance.canon_site.canon_site_num])}" ) - tag = val.instance.name.split('+')[0] + # tag = val.instance.name.split('+')[0] + tag = val.instance.name so_list = [ - site_observation_objects[k].instance - for k in 
val.index_data["members"] - # site_observations_versioned[k] - # for k in val.index_data["members"] + site_observation_objects[k].instance for k in val.index_data["members"] ] self._tag_observations( tag, prefix, category=cat_conf, site_observations=so_list, hidden=True @@ -1880,13 +1854,66 @@ def process_bundle(self): logger.debug("xtalform objects tagged") + # enumerate xtalform_sites. a bit trickier than others because + # requires alphabetic enumeration starting from the letter of + # the chain and following from there + + # sort the dictionary + # fmt: off + xtls_sort_qs = XtalformSite.objects.filter( + pk__in=[k.instance.pk for k in xtalform_sites_objects.values() ], # pylint: disable=no-member + ).annotate( + obvs=Count("canon_site__canonsiteconf__siteobservation", default=0), + ).order_by("-obvs", "xtalform_site_id") + # ordering by xtalform_site_id is not strictly necessary, but + # makes the sorting consistent + + # fmt: on + + _xtalform_sites_objects = {} + for xtl in xtls_sort_qs: + key = f"{xtl.xtalform_site_id}/{xtl.version}" + _xtalform_sites_objects[key] = xtalform_sites_objects[ + key + ] # pylint: disable=no-member + + if self.version_number == 1: + # first upload, use the chain letter + xtnum = alphanumerator( + start_from=xtls_sort_qs[0].lig_chain.lower(), drop_first=False + ) + else: + # subsequent upload, just use the latest letter as starting point + # fmt: off + last_xtsite = XtalformSite.objects.filter( + pk__in=[ + k.instance.pk + for k in _xtalform_sites_objects.values() # pylint: disable=no-member + ] + ).order_by( + "-xtalform_site_num" + )[0].xtalform_site_num + # fmt: on + xtnum = alphanumerator(start_from=last_xtsite) + + # this should be rare, as Frank said, all crystal-related + # issues should be resolved by the time of the first + # upload. 
In fact, I'll mark this momentous occasion here: + logger.warning("New XtalformSite objects added in subsequent uploads") + + for val in _xtalform_sites_objects.values(): # pylint: disable=no-member + if not val.instance.xtalform_site_num: + val.instance.xtalform_site_num = next(xtnum) + val.instance.save() + cat_xtalsite = TagCategory.objects.get(category="CrystalformSites") - for val in xtalform_sites_objects.values(): # pylint: disable=no-member + for val in _xtalform_sites_objects.values(): # pylint: disable=no-member prefix = ( f"F{val.instance.xtalform.xtalform_num}" + f"{val.instance.xtalform_site_num}" ) - tag = f"{val.instance.xtalform.name} - {val.instance.xtalform_site_id}" + # tag = val.instance.xtalform_site_id + tag = val.versioned_key so_list = [ site_observation_objects[k].instance for k in val.index_data["residues"] ] @@ -1966,7 +1993,9 @@ def _extract( def _generate_poses(self): values = ["canon_site_conf__canon_site", "cmpd"] # fmt: off - pose_groups = SiteObservation.objects.exclude( + pose_groups = SiteObservation.filter_manager.by_target( + self.target, + ).exclude( canon_site_conf__canon_site__isnull=True, ).exclude( cmpd__isnull=True, @@ -1999,16 +2028,23 @@ def _generate_poses(self): pose.save() except MultipleObjectsReturned: # must be a follow-up upload. 
create new pose, but - # only add observatons that are not yet assigned + # only add observations that are not yet assigned (if + # these exist) pose_items = pose_items.filter(pose__isnull=True) - sample = pose_items.first() - pose = Pose( - canon_site=sample.canon_site_conf.canon_site, - compound=sample.cmpd, - main_site_observation=sample, - display_name=sample.code, - ) - pose.save() + if pose_items.exists(): + sample = pose_items.first() + pose = Pose( + canon_site=sample.canon_site_conf.canon_site, + compound=sample.cmpd, + main_site_observation=sample, + display_name=sample.code, + ) + pose.save() + else: + # I don't know if this can happen but this (due to + # other bugs) is what allowed me to find this + # error. Make a note in the logs. + logger.warning("No observations left to assign to pose") # finally add observations to the (new or existing) pose for obvs in pose_items: diff --git a/viewer/utils.py b/viewer/utils.py index e170a0ee..8bfe405f 100644 --- a/viewer/utils.py +++ b/viewer/utils.py @@ -409,8 +409,17 @@ def restore_curated_tags(filename: str) -> None: logger.error(exc) -def alphanumerator(start_from: str = "") -> Generator[str, None, None]: - """Return alphabetic generator (A, B .. AA, AB...) starting from a specified point.""" +def alphanumerator( + start_from: str = "", drop_first: bool = True +) -> Generator[str, None, None]: + """Return alphabetic generator (A, B .. AA, AB...) starting from a specified point. + + drop_first - as per workflow, usually it's given the last letter + of previous sequence so the next in the pipeline should be + start_from + 1. drop_first = False indicates this is not necessary + and start_from will be the first value the iterator produces + + """ # since product requries finite maximum return string length set # to 10 characters.
that should be enough for fragalysis (and to @@ -426,7 +435,8 @@ def alphanumerator(start_from: str = "") -> Generator[str, None, None]: if start_from is not None and start_from != '': start_from = start_from.lower() generator = itertools.dropwhile(lambda x: x != start_from, generator) # type: ignore[assignment] - # and drop one more, then it starts from after the start from as it should - _ = next(generator) + if drop_first: + # drop one more, then it starts from after the start from as it should + _ = next(generator) return generator