diff --git a/src/scitacean/client.py b/src/scitacean/client.py
index a9a844a1..013df78d 100644
--- a/src/scitacean/client.py
+++ b/src/scitacean/client.py
@@ -291,7 +291,8 @@ def upload_new_dataset_now(self, dataset: Dataset) -> Dataset:
         with_new_pid = dataset.replace(_read_only={"pid": finalized_model.pid})
         finalized_orig_datablocks = self._upload_orig_datablocks(
-            with_new_pid.make_datablock_upload_models().orig_datablocks
+            with_new_pid.make_datablock_upload_models().orig_datablocks,
+            with_new_pid.pid,  # type: ignore[arg-type]
         )
         finalized_attachments = self._upload_attachments_for_dataset(
             with_new_pid.make_attachment_upload_models(),
@@ -333,20 +334,22 @@ def upload_new_sample_now(self, sample: model.Sample) -> model.Sample:
         return model.Sample.from_download_model(finalized_model)

     def _upload_orig_datablocks(
-        self, orig_datablocks: list[model.UploadOrigDatablock] | None
+        self,
+        orig_datablocks: list[model.UploadOrigDatablock] | None,
+        dataset_id: PID,
     ) -> list[model.DownloadOrigDatablock]:
         if not orig_datablocks:
             return []

         try:
             return [
-                self.scicat.create_orig_datablock(orig_datablock)
+                self.scicat.create_orig_datablock(orig_datablock, dataset_id=dataset_id)
                 for orig_datablock in orig_datablocks
             ]
         except ScicatCommError as exc:
             raise RuntimeError(
                 "Failed to upload original datablocks for SciCat dataset "
-                f"{orig_datablocks[0].datasetId}:"
+                f"{dataset_id}:"
                 f"\n{exc.args}\nThe dataset and data files were successfully uploaded "
                 "but are not linked with each other. Please fix the dataset manually!"
             ) from exc
@@ -862,7 +865,10 @@ def create_dataset_model(
         )

     def create_orig_datablock(
-        self, dblock: model.UploadOrigDatablock
+        self,
+        dblock: model.UploadOrigDatablock,
+        *,
+        dataset_id: PID,
     ) -> model.DownloadOrigDatablock:
         """Create a new orig datablock in SciCat.

@@ -878,6 +884,8 @@ def create_orig_datablock(
         Parameters
         ----------
         dblock:
             Model of the orig datablock to create.
+        dataset_id:
+            PID of the dataset that this datablock belongs to.
         Raises
         ------
@@ -887,7 +895,7 @@ def create_orig_datablock(
         """
         uploaded = self._call_endpoint(
             cmd="post",
-            url="origdatablocks",
+            url=f"datasets/{quote_plus(str(dataset_id))}/origdatablocks",
             data=dblock,
             operation="create_orig_datablock",
         )
diff --git a/src/scitacean/datablock.py b/src/scitacean/datablock.py
index 8792c609..34b0ea51 100644
--- a/src/scitacean/datablock.py
+++ b/src/scitacean/datablock.py
@@ -8,14 +8,9 @@
 import dataclasses
 from collections.abc import Iterable, Iterator
 from datetime import datetime
-from typing import TYPE_CHECKING

 from .file import File
 from .model import DownloadOrigDatablock, UploadOrigDatablock
-from .pid import PID
-
-if TYPE_CHECKING:
-    from .dataset import Dataset


 # TODO Datablock
@@ -32,13 +27,10 @@ class OrigDatablock:
     _files: list[File] = dataclasses.field(init=False)
     checksum_algorithm: str | None = None
-    instrument_group: str | None = None
-    owner_group: str | None = None
     init_files: dataclasses.InitVar[Iterable[File] | None] = None
     _access_groups: list[str] | None = None
     _created_at: datetime | None = None
     _created_by: str | None = None
-    _dataset_id: PID | None = None
     _id: str | None = None
     _is_published: bool | None = None
     _updated_at: datetime | None = None
@@ -67,12 +59,9 @@ def from_download_model(
         dblock = orig_datablock_model
         return OrigDatablock(
             checksum_algorithm=dblock.chkAlg,
-            owner_group=dblock.ownerGroup,
-            instrument_group=dblock.instrumentGroup,
             _access_groups=dblock.accessGroups,
             _created_at=dblock.createdAt,
             _created_by=dblock.createdBy,
-            _dataset_id=orig_datablock_model.datasetId,
             _id=orig_datablock_model.id,
             _is_published=orig_datablock_model.isPublished,
             _updated_at=dblock.updatedAt,
@@ -118,11 +107,6 @@ def updated_by(self) -> str | None:
         """User who last updated this datablock."""
         return self._updated_by

-    @property
-    def dataset_id(self) -> PID | None:
-        """PID of the dataset this datablock belongs to."""
-        return self._dataset_id
-
     @property
     def datablock_id(self) -> str | None:
         """ID of this datablock."""
@@ -146,26 +130,16 @@ def add_files(self, *files: File) -> None:
             for f in files
         )

-    def make_upload_model(self, dataset: Dataset) -> UploadOrigDatablock:
+    def make_upload_model(self) -> UploadOrigDatablock:
         """Build a new pydantic model to upload this datablock.

-        Parameters
-        ----------
-        dataset:
-            The dataset that this orig datablock belongs to.
-
         Returns
         -------
         :
             A new model for this orig datablock.
         """
-        owner_group = self.owner_group or dataset.owner_group
         return UploadOrigDatablock(
             chkAlg=self.checksum_algorithm,
             size=self.size,
             dataFileList=[file.make_model(for_archive=False) for file in self.files],
-            datasetId=dataset.pid,  # type: ignore[arg-type]
-            ownerGroup=owner_group,
-            accessGroups=self.access_groups or dataset.access_groups,
-            instrumentGroup=self.instrument_group or dataset.instrument_group,
         )
diff --git a/src/scitacean/dataset.py b/src/scitacean/dataset.py
index 0862d8ad..36430ba6 100644
--- a/src/scitacean/dataset.py
+++ b/src/scitacean/dataset.py
@@ -407,7 +407,7 @@ def add_orig_datablock(self, *, checksum_algorithm: str | None) -> OrigDatablock
             The newly added datablock.
""" dblock = OrigDatablock( - checksum_algorithm=checksum_algorithm, _dataset_id=self.pid + checksum_algorithm=checksum_algorithm, ) self._orig_datablocks.append(dblock) return dblock @@ -470,7 +470,7 @@ def make_datablock_upload_models(self) -> DatablockUploadModels: return DatablockUploadModels(orig_datablocks=None) return DatablockUploadModels( orig_datablocks=[ - dblock.make_upload_model(self) for dblock in self._orig_datablocks + dblock.make_upload_model() for dblock in self._orig_datablocks ] ) diff --git a/src/scitacean/model.py b/src/scitacean/model.py index a62cdad4..cdc88510 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -340,12 +340,8 @@ def upload_model_type(cls) -> type[UploadOrigDatablock]: class UploadOrigDatablock(BaseModel): dataFileList: list[UploadDataFile] - datasetId: PID size: NonNegativeInt - accessGroups: list[str] | None = None chkAlg: str | None = None - instrumentGroup: str | None = None - ownerGroup: str | None = None @classmethod def download_model_type(cls) -> type[DownloadOrigDatablock]: diff --git a/src/scitacean/testing/backend/seed.py b/src/scitacean/testing/backend/seed.py index c5b0ceeb..4d8f188e 100644 --- a/src/scitacean/testing/backend/seed.py +++ b/src/scitacean/testing/backend/seed.py @@ -43,11 +43,9 @@ datasetName="My darkest magic yet", description="Doing some dark shit", isPublished=False, - numberOfFiles=2, numberOfFilesArchived=0, owner="PLACEHOLDER", ownerEmail="PLACE@HOLD.ER", - size=619, sourceFolder=RemotePath("/hex/data/123"), type=DatasetType.RAW, principalInvestigator="Ponder Stibbons", @@ -68,11 +66,9 @@ datasetName="Reprocessed dark magic", description="Making it even darker", isPublished=True, - numberOfFiles=1, numberOfFilesArchived=0, owner="PLACEHOLDER", ownerEmail="PLACE@HOLD.ER", - size=464, sourceFolder=RemotePath("/hex/data/dd"), type=DatasetType.DERIVED, investigator="Ponder Stibbons", @@ -92,11 +88,9 @@ datasetName="Shoe counter", description="Got all these shoes!", isPublished=True, - numberOfFiles=1, numberOfFilesArchived=0, owner="PLACEHOLDER", ownerEmail="PLACE@HOLD.ER", - size=64, sourceFolder=RemotePath("/hex/secret/stuff"), type=DatasetType.RAW, principalInvestigator="Mustrum Ridcully", @@ -133,10 +127,7 @@ _ORIG_DATABLOCKS: dict[str, list[UploadOrigDatablock]] = { "raw": [ UploadOrigDatablock( - datasetId=PID(pid="PLACEHOLDER"), - ownerGroup="PLACEHOLDER", size=619, - accessGroups=["uu", "faculty"], chkAlg="md5", dataFileList=[ UploadDataFile( @@ -162,10 +153,7 @@ ], "derived": [ UploadOrigDatablock( - datasetId=PID(pid="PLACEHOLDER"), - ownerGroup="PLACEHOLDER", size=464, - accessGroups=["uu", "faculty"], chkAlg="sha256", dataFileList=[ UploadDataFile( @@ -182,10 +170,7 @@ ], "public": [ UploadOrigDatablock( - datasetId=PID(pid="PLACEHOLDER"), - ownerGroup="PLACEHOLDER", size=64, - accessGroups=["uu"], chkAlg="md5", dataFileList=[ UploadDataFile( @@ -239,15 +224,6 @@ def _apply_config_dataset( return dset -def _apply_config_orig_datablock( - dblock: UploadOrigDatablock, dset: DownloadDataset, user: SciCatUser -) -> UploadOrigDatablock: - dblock = deepcopy(dblock) - dblock.ownerGroup = user.group - dblock.datasetId = dset.pid # type: ignore[assignment] - return dblock - - def _apply_config_attachment( attachment: UploadAttachment, user: SciCatUser ) -> UploadAttachment: @@ -282,20 +258,24 @@ def seed_database(*, client: Client, scicat_access: SciCatAccess) -> None: } INITIAL_DATASETS.update(download_datasets) - upload_orig_datablocks = { + download_orig_datablocks = { key: [ - 
-            _apply_config_orig_datablock(
-                dblock, download_datasets[key], scicat_access.user
+            client.scicat.create_orig_datablock(
+                dblock,
+                dataset_id=download_datasets[key].pid,  # type: ignore[arg-type]
             )
             for dblock in dblocks
         ]
         for key, dblocks in _ORIG_DATABLOCKS.items()
     }
-    download_orig_datablocks = {
-        key: [client.scicat.create_orig_datablock(dblock) for dblock in dblocks]
-        for key, dblocks in upload_orig_datablocks.items()
-    }
     INITIAL_ORIG_DATABLOCKS.update(download_orig_datablocks)
+    for key, dblocks in INITIAL_ORIG_DATABLOCKS.items():
+        # Need to set these after uploading the datablocks to
+        # make sure that the database has the correct values.
+        INITIAL_DATASETS[key].numberOfFiles = sum(
+            len(dblock.dataFileList or ()) for dblock in dblocks
+        )
+        INITIAL_DATASETS[key].size = sum(dblock.size or 0 for dblock in dblocks)

     upload_attachments = {
         key: [
diff --git a/src/scitacean/testing/client.py b/src/scitacean/testing/client.py
index dadb5faf..55e8bddb 100644
--- a/src/scitacean/testing/client.py
+++ b/src/scitacean/testing/client.py
@@ -236,10 +236,9 @@ def create_dataset_model(

     @_conditionally_disabled
     def create_orig_datablock(
-        self, dblock: model.UploadOrigDatablock
+        self, dblock: model.UploadOrigDatablock, *, dataset_id: PID
     ) -> model.DownloadOrigDatablock:
         """Create a new orig datablock in SciCat."""
-        dataset_id = dblock.datasetId
         if (dset := self.main.datasets.get(dataset_id)) is None:
             raise ScicatCommError(f"No dataset with id {dataset_id}")
         ingested = _process_orig_datablock(dblock, dset)
@@ -357,6 +356,7 @@ def _process_orig_datablock(
         createdAt=created_at,
         updatedBy="fake",
         updatedAt=created_at,
+        datasetId=dset.pid,
         **fields,
     )
     return processed
diff --git a/tests/client/datablock_client_test.py b/tests/client/datablock_client_test.py
index 0c53b989..c0c95d23 100644
--- a/tests/client/datablock_client_test.py
+++ b/tests/client/datablock_client_test.py
@@ -50,8 +50,6 @@ def orig_datablock(scicat_access):
                 path="data.nxs", size=9235, time=parse_date("2023-08-18T13:52:33.000Z")
             )
         ],
-        datasetId="PLACEHOLDER",
-        ownerGroup=scicat_access.user.group,
     )


@@ -64,8 +62,7 @@ def test_get_orig_datablock(scicat_client, key):

 def test_create_first_orig_datablock(scicat_client, derived_dataset, orig_datablock):
     uploaded = scicat_client.create_dataset_model(derived_dataset)
-    orig_datablock.datasetId = uploaded.pid
-    scicat_client.create_orig_datablock(orig_datablock)
+    scicat_client.create_orig_datablock(orig_datablock, dataset_id=uploaded.pid)
     downloaded = scicat_client.get_orig_datablocks(uploaded.pid)
     assert len(downloaded) == 1
     downloaded = downloaded[0]
diff --git a/tests/client/dataset_client_test.py b/tests/client/dataset_client_test.py
index 0380619b..a41499d9 100644
--- a/tests/client/dataset_client_test.py
+++ b/tests/client/dataset_client_test.py
@@ -6,7 +6,7 @@
 import pytest
 from dateutil.parser import parse as parse_date

-from scitacean import PID, Client, RemotePath, ScicatCommError
+from scitacean import PID, Client, Dataset, RemotePath, ScicatCommError
 from scitacean.client import ScicatClient
 from scitacean.model import (
     DatasetType,
@@ -141,3 +141,22 @@ def test_get_broken_dataset_strict_validation(real_client, require_scicat_backen
     dset = INITIAL_DATASETS["partially-broken"]
     with pytest.raises(pydantic.ValidationError):
         real_client.get_dataset(dset.pid, strict_validation=True)
+
+
+def test_dataset_with_orig_datablock_roundtrip(client):
+    ds = Dataset.from_download_models(
+        INITIAL_DATASETS["raw"], INITIAL_ORIG_DATABLOCKS["raw"], []
+    ).as_new()
+    # Unset fields that a raw dataset should not have but were initialized
+    # in the download model.
+    ds.input_datasets = None
+    ds.used_software = None
+
+    # We don't need a file transfer because all files are on remote.
+    finalized = client.upload_new_dataset_now(ds)
+    downloaded = client.get_dataset(finalized.pid)
+
+    assert downloaded.name == ds.name
+    assert downloaded.owner == ds.owner
+    assert downloaded.size == ds.size
+    assert downloaded.number_of_files == ds.number_of_files
diff --git a/tests/dataset_test.py b/tests/dataset_test.py
index 587784d0..c3da3f14 100644
--- a/tests/dataset_test.py
+++ b/tests/dataset_test.py
@@ -344,7 +344,6 @@ def test_make_scicat_models_datablock_with_one_file(dataset):
     block = blocks[0]

     assert block.size == 6163
-    assert block.datasetId == dataset.pid
     assert block.dataFileList == [model.UploadDataFile(**file_model.model_dump())]
diff --git a/tools/model-generation/spec/__init__.py b/tools/model-generation/spec/__init__.py
index 25cd466c..43b53ac9 100644
--- a/tools/model-generation/spec/__init__.py
+++ b/tools/model-generation/spec/__init__.py
@@ -143,7 +143,7 @@ def user_dset_fields(self, manual: bool | None = None) -> list[DatasetField]:

 _SCHEMA_GROUPS = {
     "Attachment": ("CreateAttachmentDto", "Attachment"),
-    "OrigDatablock": ("CreateOrigDatablockDto", "OrigDatablock"),
+    "OrigDatablock": ("CreateDatasetOrigDatablockDto", "OrigDatablock"),
     "Datablock": ("CreateDatasetDatablockDto", "Datablock"),
     "Lifecycle": (None, "LifecycleClass"),
     "Technique": ("TechniqueClass", "TechniqueClass"),
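
Reviewer note: below is a minimal before/after sketch of the call-site migration this patch implies. The names client, uploaded, and files are hypothetical stand-ins for a connected Client, a freshly created dataset model, and a list of UploadDataFile; the keyword-only dataset_id argument, the slimmed-down UploadOrigDatablock, and the new URL are taken from the patch itself.

    from scitacean.model import UploadOrigDatablock

    # Before: the dataset PID and ownership metadata travelled inside the
    # upload model, which was posted to the flat "origdatablocks" route.
    dblock = UploadOrigDatablock(
        dataFileList=files,
        size=619,
        chkAlg="md5",
        datasetId=uploaded.pid,
        ownerGroup="faculty",
    )
    client.scicat.create_orig_datablock(dblock)

    # After: the upload model no longer references its dataset. The PID is
    # passed as a keyword-only argument and is quoted into the endpoint URL,
    # f"datasets/{quote_plus(str(dataset_id))}/origdatablocks", matching the
    # backend's CreateDatasetOrigDatablockDto.
    dblock = UploadOrigDatablock(
        dataFileList=files,
        size=619,
        chkAlg="md5",
    )
    client.scicat.create_orig_datablock(dblock, dataset_id=uploaded.pid)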