updates to chunking, only copy 1st level data
akhanf committed Jan 13, 2025
1 parent dcfb77d commit aa1e285
Showing 2 changed files with 32 additions and 23 deletions.
config/samples.tsv (2 changes: 1 addition & 1 deletion)
@@ -1,2 +1,2 @@
 subject sample acq stain_0 stain_1 stain_2 sample_path
-brown brain imaris Lectin PI Abeta /cifs/trident/projects/Brown/241129_AMR_1wk_App_tau_E3_34-3_M2_1x1_09-57-23/09-57-23_AMR_1wk_App_tau_E3_34-3_M2_1x1_Blaze_C00_xyz-Table Z0000.ome.ims
+brown brain imaris Iba1 GFAP YOPRO /cifs/trident/projects/Brown/241129_AMR_1wk_App_tau_E3_34-3_M2_1x1_09-57-23/09-57-23_AMR_1wk_App_tau_E3_34-3_M2_1x1_Blaze_C00_xyz-Table Z0000.ome.ims
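For orientation, the sample sheet is tab-separated, with one row per sample listing its stains and the path to the source Imaris (.ims) file. A minimal sketch of loading it, assuming pandas; this is illustrative, not the repository's actual parsing code:

import pandas as pd

# Read the tab-separated sample sheet; the header row names the columns.
samples = pd.read_csv("config/samples.tsv", sep="\t")

# Each row maps a subject/sample to its stains and its Imaris (.ims) path.
for _, row in samples.iterrows():
    stains = [row["stain_0"], row["stain_1"], row["stain_2"]]
    print(row["subject"], row["sample"], stains, row["sample_path"])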
workflow/scripts/imaris_to_ome_zarr.py (53 changes: 31 additions & 22 deletions)
@@ -11,43 +11,54 @@
 from lib.cloud_io import get_fsspec, is_remote


-def convert_hdf5_to_zarr(hdf5_path, zarr_path):
+def convert_hdf5_to_zarr(hdf5_path, zarr_path, chunks):
     """
     Convert an HDF5 file to Zarr using h5py and zarr.

     Parameters:
         hdf5_path (str): Path to the input HDF5 (.ims) file.
         zarr_path (str): Path to the output Zarr dataset.
+        chunks (tuple): Chunk shape to use for the copied datasets.
     """

     # Open the HDF5 file and create a Zarr root group
     with h5py.File(hdf5_path, "r") as hdf5_file:
         zarr_store = zarr.open_group(zarr_path, mode="w")

-        def copy_group(hdf5_group, zarr_group):
-            for key, item in hdf5_group.items():
-                if isinstance(item, h5py.Group):  # Recursively copy groups
-                    new_group = zarr_group.create_group(key)
-                    copy_group(item, new_group)
-                elif isinstance(item, h5py.Dataset):  # Copy datasets
-                    zarr_group.create_dataset(
-                        name=key,
-                        data=item[()],
-                        chunks=item.chunks,
-                        dtype=item.dtype,
-                        compression="blosc"  # Optional compression
-                    )
-                    print(f"Copied dataset: {key}")
-
-        # Start copying from the root group
-        copy_group(hdf5_file, zarr_store)
+        # Define the specific path to copy
+        target_path = "DataSet/ResolutionLevel 0/TimePoint 0"
+
+        # Check if the target path exists in HDF5
+        if target_path in hdf5_file:
+            hdf5_group = hdf5_file[target_path]
+
+            def copy_group(hdf5_group, zarr_group):
+                for key, item in hdf5_group.items():
+                    if isinstance(item, h5py.Group) and key.startswith("Channel"):  # Only copy Channel groups
+                        channel_group = item
+                        if "Data" in channel_group:  # Only copy the Data dataset in each Channel
+                            data_item = channel_group["Data"]
+                            zarr_group.create_dataset(
+                                name=key + "/Data",  # Store Data under the Channel group
+                                data=data_item[()],
+                                chunks=chunks,
+                                dtype=data_item.dtype,
+                                compression="blosc"  # Optional compression
+                            )
+                            print(f"Copied Data dataset for {key}")
+                    # No need to copy other groups or datasets; only 'Data' is needed
+
+            # Start copying only the Channel groups
+            copy_group(hdf5_group, zarr_store)

     print(f"Converted HDF5 file to Zarr at: {zarr_path}")

+rechunk_size=snakemake.params.rechunk_size

 # copy imaris (hdf5) to zarr; only the first resolution level and timepoint are copied
 convert_hdf5_to_zarr(
     hdf5_path=snakemake.input.ims,
     zarr_path='copy_hdf5.zarr',
+    chunks=rechunk_size
 )
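For context, the converter above can also be exercised outside Snakemake. A minimal sketch; the input path and chunk shape below are placeholders (Imaris Data datasets are typically 3D z/y/x blocks), not values taken from this pipeline:

# Hypothetical standalone call; path and chunk shape are placeholders.
convert_hdf5_to_zarr(
    hdf5_path="sample.ome.ims",
    zarr_path="copy_hdf5.zarr",
    chunks=(64, 256, 256),  # must match the rank of each channel's Data dataset
)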


@@ -56,7 +67,6 @@
 metadata_json=snakemake.input.metadata_json
 downsampling=snakemake.params.downsampling
 max_layer=snakemake.params.max_downsampling_layers  # number of downsamplings by 2 to include in zarr
-rechunk_size=snakemake.params.rechunk_size
 out_zarr=snakemake.output.zarr
 stains=snakemake.params.stains
 scaling_method=snakemake.params.scaling_method
@@ -105,8 +115,7 @@
 for zarr_i, stain in enumerate(stains):
     # open zarr to get group name
     zi = zarr.open(in_zarr)
-    # darr_list.append(da.from_zarr(in_zarr, component=f'DataSet/ResolutionLevel 0/TimePoint 0/Channel {zarr_i}/Data', chunks=rechunk_size))
-    darr_list.append(da.from_zarr(in_zarr, component=f'DataSet/ResolutionLevel 0/TimePoint 0/Channel {zarr_i}/Data'))
+    darr_list.append(da.from_zarr(in_zarr, component=f'Channel {zarr_i}/Data'))


 # append to omero metadata
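Downstream, each per-channel array read above is stacked into a single multichannel dask array before the OME-Zarr pyramid is written. A rough sketch of that step, with the stain list, store path, and stacking axis assumed rather than taken from the repository:

import dask.array as da

in_zarr = "copy_hdf5.zarr"          # store written by convert_hdf5_to_zarr
stains = ["Iba1", "GFAP", "YOPRO"]  # example stains from the sample sheet

# One lazy dask array per channel, read from the simplified 'Channel N/Data' layout.
darr_list = [
    da.from_zarr(in_zarr, component=f"Channel {i}/Data")
    for i, _stain in enumerate(stains)
]

# Stack along a new leading channel axis (assumed c, z, y, x layout).
darr = da.stack(darr_list, axis=0)
print(darr.shape, darr.chunksize)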
