Store header and footer as datasets not attributes
This fixes a problem with very long footers caused by padded image
channels; see #9.

Also stores the names of present channels as "AINames", see #10
clbarnes committed Feb 16, 2023
1 parent f469c46 commit 6f912da
Showing 5 changed files with 74 additions and 73 deletions.
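For consumers of converted files, the upshot of this commit is that the raw header and footer now live in uint8 datasets rather than attributes (sidestepping the attribute size limits that very long footers apparently hit, see #9), and the names of the channels actually present are listed in an `AINames` attribute (see #10). A minimal sketch of reading a converted file back with h5py; the file path and the root-group layout are assumptions:

```python
import h5py

# Hypothetical path to a file produced by dat2hdf5, converted at the root group.
with h5py.File("converted.h5", "r") as f:
    # "_header" and "_footer" are now uint8 datasets rather than attributes,
    # so arbitrarily long footers can be stored.
    header = f["_header"][:].tobytes()
    footer = f["_footer"][:].tobytes()
    print(len(header), len(footer))

    # Names of the channels actually present, e.g. ["AI1", "AI2"].
    for name in f.attrs["AINames"]:
        ds = f[name]
        print(name, ds.shape, ds.dtype)
```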
29 changes: 16 additions & 13 deletions README.md
@@ -41,6 +41,9 @@ Additionally, date fields (listed and serialised according to the format specifi
e.g. `{"SWdate": "02/01/2023"}` (ambiguous, locale-dependent, not sortable) would be represented as
`{"SWdate": "02/01/2023", "SWdate__iso": "2023-01-02"}` (internationally standardised) in the output.

+Depending on the file, certain channels may or may not be present.
+The names of existing channels are stored in an attribute called `AINames`.

Jeiss microscopes can output CSV files with additional metadata.
This metadata can be stored as attributes on an empty group called `additional_metadata` in the output HDF5,
so long as the correct row can be found based on the .dat file's acquisition date.
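Given the `__iso` companion convention described above, consumers can parse and sort acquisition dates without locale guesswork; a minimal sketch using the example values from the README:

```python
from datetime import date

meta = {"SWdate": "02/01/2023", "SWdate__iso": "2023-01-02"}
# Prefer the ISO-serialised companion field: unambiguous and sortable.
acquired = date.fromisoformat(meta["SWdate__iso"])
assert acquired == date(2023, 1, 2)
```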
@@ -59,19 +62,19 @@ usage: dat2hdf5 [-h] [-m] [-c CHUNKS] [-z COMPRESSION] [-B] [-o] [-f]
Convert a Jeiss FIBSEM .dat file into a standard HDF5 group (which may be the
container's root), preserving all known metadata as group attributes.
-Additionally stores the raw header and footer bytes as uint8 arrays (under
-keys "_header" and "_footer" respectively), the version string of the
-conversion tool ("_dat2hdf5_version"). If the full contents of the .dat were
-written to HDF5 successfully, the field "_conversion_complete" will exist and
-be True. The length of the original .dat file is stored in "_dat_nbytes", and
-its filename or path can optionally be written in "_dat_filename". Each
-channel which exists is stored as a dataset within the group named "AI1",
-"AI2", ..., based on the original base-1 channel index ("AI" stands for
-"Analogue Input"). Channel datasets optionally store the minimum and maximum
-values as attributes "min" and "max". Datasets may optionally be chunked,
-compressed, and/or have other filters applied. Lastly, additional metadata
-from a CSV indexed by acquisition date and time can be included as attributes
-on an empty "additional_metadata" subgroup.
+Additionally stores the version string of the conversion tool
+("_dat2hdf5_version"). If the full contents of the .dat were written to HDF5
+successfully, the field "_conversion_complete" will exist and be True. The
+length of the original .dat file is stored in "_dat_nbytes", and its filename
+or path can optionally be written in "_dat_filename". Each channel which
+exists is stored as a dataset within the group named "AI1", "AI2", ..., based
+on the original base-1 channel index ("AI" stands for "Analogue Input").
+Channel datasets optionally store the minimum and maximum values as attributes
+"min" and "max". Datasets may optionally be chunked, compressed, and/or have
+other filters applied. Also stores the raw header and footer bytes as uint8
+array datasets (under names "_header" and "_footer" respectively). Lastly,
+additional metadata from a CSV indexed by acquisition date and time can be
+included as attributes on an empty "additional_metadata" subgroup.

positional arguments:
dat Path to a .dat file
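As the usage text notes, the `min` and `max` attributes on channel datasets are optional, so readers should not assume they exist. A hedged sketch of recovering a channel's value range; the helper name and fallback behaviour are illustrative, not part of the tool:

```python
import h5py


def channel_range(ds: h5py.Dataset) -> tuple[float, float]:
    # Use the converter's precomputed attributes when they were stored...
    if "min" in ds.attrs and "max" in ds.attrs:
        return float(ds.attrs["min"]), float(ds.attrs["max"])
    # ...otherwise fall back to scanning the data itself.
    arr = ds[:]
    return float(arr.min()), float(arr.max())
```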
5 changes: 3 additions & 2 deletions jeiss_convert/constants.py
@@ -4,8 +4,9 @@
DATASET_PREFIX = "AI"

VERSION_FIELD = "_dat2hdf5_version"
-HEADER_FIELD = "_header"
-FOOTER_FIELD = "_footer"
+HEADER_DS = "_header"
+FOOTER_DS = "_footer"
CONVERSION_COMPLETE_FIELD = "_conversion_complete"
DAT_NBYTES_FIELD = "_dat_nbytes"
DAT_FILENAME_FIELD = "_dat_filename"
+CHANNEL_NAMES_FIELD = "AINames"
6 changes: 3 additions & 3 deletions jeiss_convert/convert.py
@@ -3,9 +3,7 @@
Convert a Jeiss FIBSEM .dat file into a standard HDF5 group
(which may be the container's root),
preserving all known metadata as group attributes.
-Additionally stores the raw header and footer bytes as uint8 arrays
-(under keys "_header" and "_footer" respectively),
-the version string of the conversion tool ("_dat2hdf5_version").
+Additionally stores the version string of the conversion tool ("_dat2hdf5_version").
If the full contents of the .dat were written to HDF5 successfully,
the field "_conversion_complete" will exist and be True.
The length of the original .dat file is stored in "_dat_nbytes",
@@ -17,6 +15,8 @@
Channel datasets optionally store the minimum and maximum values
as attributes "min" and "max".
Datasets may optionally be chunked, compressed, and/or have other filters applied.
+Also stores the raw header and footer bytes as uint8 array datasets
+(under names "_header" and "_footer" respectively).
Lastly, additional metadata from a CSV indexed by acquisition date and time
can be included as attributes on an empty "additional_metadata" subgroup.
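Since `_conversion_complete` is only written once the full contents of the .dat have been stored, its presence doubles as an integrity check; a minimal sketch (the helper name is illustrative):

```python
import h5py


def is_complete(group: h5py.Group) -> bool:
    # "_conversion_complete" only exists (and is True) after a successful
    # conversion, so treat its absence as an incomplete write.
    return bool(group.attrs.get("_conversion_complete", False))
```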
29 changes: 25 additions & 4 deletions jeiss_convert/hdf5.py
@@ -3,9 +3,19 @@
from pathlib import Path

import h5py

-from .constants import CONVERSION_COMPLETE_FIELD, DAT_FILENAME_FIELD, SUBGROUP_NAME
-from .utils import group_to_bytes, split_channels
+import numpy as np
+
+from .constants import (
+    CHANNEL_NAMES_FIELD,
+    CONVERSION_COMPLETE_FIELD,
+    DAT_FILENAME_FIELD,
+    FOOTER_DS,
+    HEADER_DS,
+    SUBGROUP_NAME,
+    VERSION_FIELD,
+)
+from .utils import ParsedData, get_channel_names, group_to_bytes
+from .version import __version__

logger = logging.getLogger(__name__)

@@ -49,7 +59,13 @@ def dat_to_hdf5(
Integer to fill out image channels with if truncated.
If None (default), error instead.
"""
-    meta, channel_names, data = split_channels(dat_path, fill=fill)
+    all_data = ParsedData.from_file(dat_path, fill=fill)
+    meta = all_data.meta.copy()
+    data = all_data.data
+
+    channel_names = get_channel_names(meta)
+    meta[CHANNEL_NAMES_FIELD] = channel_names
+    meta[VERSION_FIELD] = __version__

if ds_kwargs is None:
ds_kwargs = dict()
@@ -67,6 +83,8 @@
if filename is not None:
g.attrs[DAT_FILENAME_FIELD] = filename

+    g.create_dataset(HEADER_DS, data=np.frombuffer(all_data.header, dtype="uint8"))

for idx, ds_name in enumerate(channel_names):
arr = data[idx]
ds = g.create_dataset(ds_name, data=arr, **ds_kwargs)
@@ -75,6 +93,9 @@
ds.attrs["min"] = arr.min()
ds.attrs["max"] = arr.max()

+    # todo: store offset at which recipe starts?
+    g.create_dataset(FOOTER_DS, data=np.frombuffer(all_data.footer, dtype="uint8"))

if additional_metadata is not None:
g2 = g.create_group(SUBGROUP_NAME)
g2.attrs.update(additional_metadata)
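The `np.frombuffer(..., dtype="uint8")` calls above are what turn the raw header and footer bytes into storable datasets; paired with `.tobytes()`, the conversion is lossless in both directions:

```python
import numpy as np

raw = b"\x00\x01\xfe\xff"
arr = np.frombuffer(raw, dtype="uint8")  # zero-copy uint8 view of the bytes
assert arr.tobytes() == raw  # exact round trip, nothing lost
```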
78 changes: 27 additions & 51 deletions jeiss_convert/utils.py
@@ -3,6 +3,7 @@
import logging
import sys
import typing as tp
+from collections.abc import Mapping
from io import BytesIO
from pathlib import Path

@@ -11,10 +12,9 @@

from jeiss_convert.constants import (
    DAT_NBYTES_FIELD,
-    FOOTER_FIELD,
-    HEADER_FIELD,
+    FOOTER_DS,
+    HEADER_DS,
    ISO_DATE_SUFFIX,
-    VERSION_FIELD,
)

from .constants import DATASET_PREFIX, ENUM_NAME_SUFFIX
@@ -27,7 +27,6 @@
HEADER_LENGTH,
SPEC_DIR,
)
-from .version import version

if sys.version_info < (3, 11):
from backports.strenum import StrEnum
@@ -263,7 +262,7 @@ def parse_file(fpath: Path, name_enums=DEFAULT_NAME_ENUMS):
return parse_bytes(b, name_enums=name_enums)


-def write_header(data: dict[str, tp.Any]):
+def write_header(data: Mapping[str, tp.Any]):
buffer = BytesIO(b"\0" * HEADER_LENGTH)
for name, dtype, offset, _ in SPECS[data["FileVersion"]]:
item = data[name]
@@ -279,8 +278,8 @@ def write_header(data: dict[str, tp.Any]):
class ParsedData(tp.NamedTuple):
meta: dict[str, tp.Any]
data: np.ndarray
-    header: tp.Optional[bytes] = None
-    footer: tp.Optional[bytes] = None
+    header: bytes = b""
+    footer: bytes = b""

@classmethod
def from_bytes(
@@ -296,10 +295,7 @@ def from_bytes(
data = read_value(b, dtype, HEADER_LENGTH, shape, fill=fill)

footer_starts = int(HEADER_LENGTH + data.nbytes)
-        if footer_starts >= len(b):
-            footer = None
-        else:
-            footer = b[footer_starts:]
+        footer = b[footer_starts:]

return cls(meta, data, header, footer)

@@ -313,16 +309,6 @@ def from_file(
with open(fpath, "rb") as f:
return cls.from_bytes(f.read(), name_enums=name_enums, fill=fill)

-    def header_hex(self) -> tp.Optional[str]:
-        if self.header is None:
-            return None
-        return self.header.hex()
-
-    def footer_hex(self) -> tp.Optional[str]:
-        if self.footer is None:
-            return None
-        return self.footer.hex()


def metadata_to_jso(meta: dict[str, tp.Any]) -> dict[str, tp.Any]:
file_ver = meta["FileVersion"]
@@ -342,52 +328,41 @@ def metadata_to_numpy(meta: dict[str, tp.Any]) -> dict[str, tp.Any]:
return out


-def split_channels(
-    dat_path: Path,
-    json_metadata=False,
-    name_enums=DEFAULT_NAME_ENUMS,
-    fill=None,
-) -> tuple[dict[str, tp.Any], list[str], np.ndarray]:
-    all_data = ParsedData.from_file(dat_path, name_enums=name_enums, fill=fill)
+def get_channel_names(meta):
    channel_names = []
    for input_id in range(1, 5):
        ds = f"{DATASET_PREFIX}{input_id}"

-        if all_data.meta[ds]:
+        if meta[ds]:
            channel_names.append(ds)

-    if json_metadata:
-        meta = metadata_to_jso(all_data.meta)
-        if all_data.header is not None:
-            meta[HEADER_FIELD] = all_data.header.hex()
-        if all_data.footer is not None:
-            meta[FOOTER_FIELD] = all_data.footer.hex()
-    else:
-        meta = all_data.meta
-        if all_data.header is not None:
-            meta[HEADER_FIELD] = np.frombuffer(all_data.header, "uint8")
-        if all_data.footer is not None:
-            meta[FOOTER_FIELD] = np.frombuffer(all_data.footer, "uint8")
+    return channel_names

-    meta[VERSION_FIELD] = version
-    return meta, channel_names, all_data.data


-def get_bytes(d: dict[str, tp.Any], key: str):
-    val = d.get(key)
+def as_bytes(val: tp.Optional[tp.Union[np.ndarray, str, bytes]]) -> bytes:
    if val is None:
        return b""
-
-    if isinstance(val, str):
+    elif isinstance(val, bytes):
+        return val
+    elif isinstance(val, str):
        return bytes.fromhex(val)
-    elif isinstance(val, np.ndarray):
+
+    if isinstance(val, h5py.Dataset):
+        val = val[:]
+
+    if isinstance(val, np.ndarray):
        return val.tobytes()

    raise ValueError(
        "Expected str (hex-encoded) or numpy array "
        f"to convert into bytes, got {type(val)}"
    )


+def get_bytes(d: dict[str, tp.Any], key: str):
+    return as_bytes(d.get(key))


def md5sum(b: bytes):
md5 = hashlib.md5()
md5.update(b)
@@ -403,13 +378,14 @@ def group_to_bytes(g: h5py.Group, json_metadata=False, check_header=True):

header = write_header(meta)
if check_header:
-        stored_header = get_bytes(meta, HEADER_FIELD)
+        stored_header = as_bytes(g.get(HEADER_DS))
if stored_header and md5sum(stored_header) != md5sum(header):
raise RuntimeError(
f"Stored header (length {len(stored_header)}) is different to "
f"calculated header (length {len(header)})"
)
-    footer = get_bytes(meta, FOOTER_FIELD)
+
+    footer = as_bytes(g.get(FOOTER_DS))

to_stack = []
for input_id in range(1, 5):
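Taken together, `group_to_bytes` and `md5sum` allow a whole-file round-trip check against the original .dat. A hedged sketch, assuming `group_to_bytes` returns the reconstructed .dat bytes (as its name and the stacking logic above suggest), with hypothetical paths:

```python
import h5py

from jeiss_convert.utils import group_to_bytes, md5sum

# Hypothetical paths: an original .dat and its dat2hdf5 conversion at the root.
with open("original.dat", "rb") as f:
    original = f.read()

with h5py.File("converted.h5", "r") as h5:
    reconstructed = group_to_bytes(h5["/"])  # verifies "_header" en route

assert md5sum(reconstructed) == md5sum(original)
```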
