Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #124

Merged
merged 3 commits into from
Sep 17, 2024
Merged

Dev #124

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
## [1.x.x] - 2024-xx-xx
## [1.1.5] - 2024-09-17

### Fix
- Accept `object` dtype for channel names (#114)

### Changed
- Update MACSima reader to read the channel names of the latest file format

## [1.1.4] - 2024-08-21

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sopa"
version = "1.1.4"
version = "1.1.5"
description = "Spatial-omics pipeline and analysis"
documentation = "https://gustaveroussy.github.io/sopa"
homepage = "https://gustaveroussy.github.io/sopa"
Expand Down
4 changes: 2 additions & 2 deletions sopa/io/reader/cosmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from spatialdata.models import Image2DModel, PointsModel
from spatialdata_io._constants._constants import CosmxKeys

from .utils import _deduplicate_c_coords, _default_image_kwargs
from .utils import _deduplicate_names, _default_image_kwargs

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -138,7 +138,7 @@ def _read_fov_image(
protein_image, protein_names = _read_protein_fov(protein_path)
image = da.concatenate([image, protein_image], axis=0)

return image, _deduplicate_c_coords(morphology_coords + protein_names)
return image, _deduplicate_names(morphology_coords + protein_names)


def _read_fov_locs(path: Path, dataset_id: str) -> pd.DataFrame:
Expand Down
28 changes: 4 additions & 24 deletions sopa/io/reader/macsima.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
from __future__ import annotations

import logging
import re
from pathlib import Path

import pandas as pd
from spatialdata import SpatialData

from .utils import _deduplicate_names, _general_tif_directory_reader
from .utils import _general_tif_directory_reader

log = logging.getLogger(__name__)

Expand All @@ -16,31 +14,13 @@ def macsima(path: Path, **kwargs: int) -> SpatialData:
"""Read MACSIMA data as a `SpatialData` object

Notes:
For all duplicated names, their index will be added in brackets after, for instance you will often find `DAPI (000)` to indicate the DAPI channel of index `000`
For all duplicated names, their index will be added in brackets after, for instance you may find `DAPI (1)`.

Args:
path: Path to the directory containing the MACSIMA `.tif` images
kwargs: Kwargs for `_general_tif_directory_reader`
kwargs: Kwargs for the `_general_tif_directory_reader`

Returns:
A `SpatialData` object with a 2D-image of shape `(C, Y, X)`
"""
return _general_tif_directory_reader(path, files_to_channels=_get_channel_names_macsima, **kwargs)


def _parse_name_macsima(file):
index = file.name[2:5] if file.name[0] == "C" else file.name[:3]
match = re.search(r"_A-(.*?)_C-", file.name)
if match:
antibody = match.group(1)
channel = re.search(r"_C-(.*?)\.tif", file.name).group(1)
uid = f"{channel}-{index}"
else:
antibody = re.search(r"_A-(.*?)\.tif", file.name).group(1)
uid = index
return [antibody, uid]


def _get_channel_names_macsima(files):
    """Build the channel names of a list of MACSima files.

    Each file is parsed with `_parse_name_macsima`, the parsed rows are gathered
    into a `DataFrame`, and the result is passed to `_deduplicate_names`.

    Args:
        files: An iterable of files accepted by `_parse_name_macsima`.

    Returns:
        Whatever `_deduplicate_names` returns for the parsed rows.
    """
    parsed_rows = []
    for file in files:
        parsed_rows.append(_parse_name_macsima(file))

    return _deduplicate_names(pd.DataFrame(parsed_rows))
return _general_tif_directory_reader(path, **kwargs)
10 changes: 1 addition & 9 deletions sopa/io/reader/phenocycler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from pathlib import Path

import dask.array as da
import pandas as pd
import tifffile as tf
from dask.delayed import delayed
from dask_image.imread import imread
Expand Down Expand Up @@ -39,7 +38,7 @@ def phenocycler(
if path.suffix == ".qptiff":
with tf.TiffFile(path) as tif:
series = tif.series[0]
names = _get_channel_names_qptiff(series)
names = _deduplicate_names([_get_channel_name_qptiff(page.description) for page in series])

delayed_image = delayed(lambda series: series.asarray())(tif)
image = da.from_delayed(delayed_image, dtype=series.dtype, shape=series.shape)
Expand Down Expand Up @@ -76,13 +75,6 @@ def _get_channel_name_qptiff(description):
return re.search(r"<Name>(.*?)</Name>", description).group(1)


def _get_channel_names_qptiff(page_series):
    """Build the channel names of a series of `.qptiff` pages.

    For each page, the name is read from `page.description` via `_get_channel_name_qptiff`
    and paired with the page position (as a string). The pairs are gathered into a
    `DataFrame` and passed to `_deduplicate_names`.

    Args:
        page_series: An iterable of pages, each exposing a `description` attribute.

    Returns:
        Whatever `_deduplicate_names` returns for the (name, position) rows.
    """
    rows = []
    for position, page in enumerate(page_series):
        channel_name = _get_channel_name_qptiff(page.description)
        rows.append([channel_name, str(position)])

    return _deduplicate_names(pd.DataFrame(rows))


def _get_IJ_channel_names(path: str) -> list[str]:
with tf.TiffFile(path) as tif:
default_names = [str(i) for i in range(len(tif.pages))]
Expand Down
28 changes: 11 additions & 17 deletions sopa/io/reader/utils.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from __future__ import annotations

import logging
from collections import defaultdict
from pathlib import Path
from typing import Callable

import dask.array as da
import numpy as np
import pandas as pd
import tifffile as tf
import xarray as xr
from dask_image.imread import imread
Expand All @@ -33,30 +33,24 @@ def _default_image_kwargs(
return image_models_kwargs, imread_kwargs


def _deduplicate_names(df):
is_duplicated = df[0].duplicated(keep=False)
df.loc[is_duplicated, 0] += " (" + df.loc[is_duplicated, 1] + ")"
return df[0].values
def _deduplicate_names(names: pd.Series | np.ndarray | list[str]) -> np.ndarray:
if not isinstance(names, pd.Series):
names = pd.Series(names)
names = names.astype(str)

duplicates = names.duplicated()
names[duplicates] += " (" + names.groupby(by=names).cumcount().astype(str)[duplicates] + ")"

def _deduplicate_c_coords(c_coords: list[str]) -> list[str]:
counter, res = defaultdict(int), []
for channel in c_coords:
if channel not in counter:
res.append(channel)
else:
res.append(f"{channel} ({counter[channel]})")
counter[channel] += 1
return res
return names.values


def _get_files_stem(files: list[Path]):
return [file.stem for file in files]
def _get_ome_channel_names(files):
    """Build the channel names of a list of files, one name per file.

    For each file, the first element returned by `_ome_channels_names` is kept,
    and the collected names are passed to `_deduplicate_names`.

    Args:
        files: An iterable of files accepted by `_ome_channels_names`.

    Returns:
        Whatever `_deduplicate_names` returns for the collected names.
    """
    first_names = []
    for file in files:
        # Only the first name reported for each file is used
        first_names.append(_ome_channels_names(file)[0])

    return _deduplicate_names(first_names)


def _general_tif_directory_reader(
path: str,
files_to_channels: Callable = _get_files_stem,
files_to_channels: Callable = _get_ome_channel_names,
suffix: str = ".tif",
image_models_kwargs: dict | None = None,
imread_kwargs: dict | None = None,
Expand Down
4 changes: 2 additions & 2 deletions sopa/io/standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .._constants import VALID_DIMENSIONS, SopaKeys
from .._sdata import get_spatial_image
from ..utils import _check_integer_dtype, get_channel_names, is_string_dtype
from ..utils import _check_integer_dtype, get_channel_names, valid_c_coords

log = logging.getLogger(__name__)

Expand All @@ -28,7 +28,7 @@ def sanity_check(sdata: SpatialData, delete_table: bool = False, warn: bool = Fa
)

c_coords = get_channel_names(image)
assert is_string_dtype(c_coords), f"Channel names must be strings, not {c_coords.dtype}"
assert valid_c_coords(c_coords), f"Channel names must be strings, not {c_coords.dtype}"

if SopaKeys.TABLE in sdata.tables:
if delete_table:
Expand Down
2 changes: 1 addition & 1 deletion sopa/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
get_channel_names,
scale_dtype,
string_channel_names,
is_string_dtype,
valid_c_coords,
_check_integer_dtype,
)
6 changes: 3 additions & 3 deletions sopa/utils/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@ def get_channel_names(image: DataArray | DataTree) -> np.ndarray:
raise ValueError(f"Image must be a DataTree or a DataArray. Found: {type(image)}")


def is_string_dtype(c_coords: np.ndarray) -> bool:
    """Check whether an array has a NumPy string dtype.

    Args:
        c_coords: The array to check.

    Returns:
        `True` if the dtype kind is `"U"` (unicode) or `"S"` (bytes), else `False`.
    """
    kind = c_coords.dtype.kind
    return kind == "U" or kind == "S"
def valid_c_coords(c_coords: np.ndarray) -> bool:
    """Check whether an array has a dtype that is accepted for channel names.

    Only the dtype is checked: the elements of an `object` array are not inspected.

    Args:
        c_coords: The array of channel names to check.

    Returns:
        `True` if the dtype kind is `"U"` (unicode), `"S"` (bytes) or `"O"` (object), else `False`.
    """
    kind = c_coords.dtype.kind
    return kind == "U" or kind == "S" or kind == "O"


def string_channel_names(sdata: SpatialData, default_single_channel: str = "DAPI"):
for key, image in list(sdata.images.items()):
c_coords = get_channel_names(image)

if is_string_dtype(c_coords):
if valid_c_coords(c_coords):
continue

c_coords = [str(i) for i in range(len(c_coords))]
Expand Down
2 changes: 1 addition & 1 deletion workflow/config/macsima/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ patchify:
segmentation:
cellpose:
diameter: 35
channels: ["DAPI (000)"]
channels: ["DAPI"]
flow_threshold: 2
cellprob_threshold: -6
min_area: 400
Expand Down
Loading