
Commit

updated to read chunk-by-chunk
akhanf committed Jan 13, 2025
1 parent aa1e285 commit abd033d
Showing 1 changed file with 34 additions and 10 deletions.
workflow/scripts/imaris_to_ome_zarr.py (44 changes: 34 additions & 10 deletions)
@@ -11,16 +11,16 @@
from lib.cloud_io import get_fsspec, is_remote


-def convert_hdf5_to_zarr(hdf5_path, zarr_path,chunks):
+def convert_hdf5_to_zarr(hdf5_path, zarr_path, chunks):
    """
-    Convert an HDF5 file to Zarr using h5py and zarr.
+    Convert an HDF5 file to Zarr using h5py and zarr, handling chunked copying.
    Parameters:
        hdf5_path (str): Path to the input HDF5 (.ims) file.
        zarr_path (str): Path to the output Zarr dataset.
        chunks (tuple): Chunk size for the Zarr dataset.
    """


    # Open the HDF5 file and create a Zarr root group
    with h5py.File(hdf5_path, "r") as hdf5_file:
        zarr_store = zarr.open_group(zarr_path, mode="w")
@@ -33,28 +33,52 @@ def convert_hdf5_to_zarr(hdf5_path, zarr_path,chunks):
        hdf5_group = hdf5_file[target_path]

        def copy_group(hdf5_group, zarr_group):
+            """
+            Copies channel groups and their 'Data' datasets chunk by chunk.
+            Args:
+                hdf5_group: HDF5 group containing the dataset.
+                zarr_group: Zarr group to write the dataset to.
+            """
            for key, item in hdf5_group.items():
                if isinstance(item, h5py.Group) and key.startswith("Channel"): # Only copy Channel groups
                    channel_group = item
                    if "Data" in channel_group: # Only copy the Data dataset in each Channel
                        data_item = channel_group["Data"]
-                        zarr_group.create_dataset(
-                            name=key + "/Data", # Store Data in the Channel group
-                            data=data_item[()],
+
+                        # Create the Zarr dataset
+                        zarr_dataset = zarr_group.require_dataset(
+                            name=key + "/Data",
+                            shape=data_item.shape,
                            chunks=chunks,
                            dtype=data_item.dtype,
-                            compression="blosc" # Optional compression
+                            compression="blosc", # Optional compression
                        )
-                        print(f"Copied Data dataset for {key}")
-            # No need to copy other groups or datasets, as we're only interested in 'Data'

+                        # Copy data chunk by chunk
+                        for i_start in range(0, data_item.shape[0], chunks[0]):
+                            for j_start in range(0, data_item.shape[1], chunks[1]):
+                                for k_start in range(0, data_item.shape[2], chunks[2]):
+                                    i_end = min(i_start + chunks[0], data_item.shape[0])
+                                    j_end = min(j_start + chunks[1], data_item.shape[1])
+                                    k_end = min(k_start + chunks[2], data_item.shape[2])

+                                    slices = (
+                                        slice(i_start, i_end),
+                                        slice(j_start, j_end),
+                                        slice(k_start, k_end),
+                                    )
+                                    # print(f"Copying slice {slices} for {key}")
+                                    zarr_dataset[slices] = data_item[slices]

        # Start copying only the Channel groups
        copy_group(hdf5_group, zarr_store)



rechunk_size=snakemake.params.rechunk_size

-#copy imaris (hdf5) to zarr -- TODO: don't need to copy everything
+#copy imaris (hdf5) to zarr
convert_hdf5_to_zarr(
    hdf5_path=snakemake.input.ims,
    zarr_path='copy_hdf5.zarr',

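A note on the change: the removed data=data_item[()] read each entire 'Data' dataset into memory before writing, while the new loop writes one chunk-aligned block at a time. Below is a minimal, self-contained sketch of that chunk-by-chunk copy pattern on a small synthetic HDF5 file; the file names, array shape, and chunk size are illustrative assumptions, not values taken from the workflow.

import h5py
import numpy as np
import zarr

chunks = (16, 16, 16)  # assumed chunk size for this toy example

# Build a toy HDF5 file standing in for an Imaris 'Data' dataset.
with h5py.File("toy.h5", "w") as f:
    f.create_dataset("Data", data=np.random.randint(0, 255, (40, 50, 60), dtype=np.uint8))

# Create the target Zarr dataset with the desired chunking, then copy block by block.
store = zarr.open_group("toy.zarr", mode="w")
with h5py.File("toy.h5", "r") as f:
    src = f["Data"]
    dst = store.require_dataset(
        name="Data",
        shape=src.shape,
        chunks=chunks,
        dtype=src.dtype,
        compression="blosc",
    )
    # Only one chunk-aligned block is held in memory at a time.
    for i in range(0, src.shape[0], chunks[0]):
        for j in range(0, src.shape[1], chunks[1]):
            for k in range(0, src.shape[2], chunks[2]):
                block = (
                    slice(i, min(i + chunks[0], src.shape[0])),
                    slice(j, min(j + chunks[1], src.shape[1])),
                    slice(k, min(k + chunks[2], src.shape[2])),
                )
                dst[block] = src[block]

    # Spot-check that the copy matches the source.
    assert np.array_equal(store["Data"][:], src[:])

Applied to an Imaris file, the same pattern avoids materializing each potentially very large channel in memory, which is what the removed data=data_item[()] line did.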