Handle path separators in the subset when exporting a datumaro dataset #1615

Merged
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -5,7 +5,17 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## \[Q3 2024 Release 1.9.0\]
## \[Unreleased\]

### New features

### Enhancements
- Raise an appropriate error when exporting a datumaro dataset if its subset name contains path separators.
(<https://github.com/openvinotoolkit/datumaro/pull/1615>)

### Bug fixes

## Q3 2024 Release 1.9.0
### New features
- Add a new CLI command: datum format
(<https://github.com/openvinotoolkit/datumaro/pull/1570>)
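To illustrate the changelog entry above: a minimal sketch (not taken from the PR) of exporting a dataset whose subset name contains a path separator. The export directory and the `from datumaro.components.dataset import Dataset` import path are assumptions; the error class and its message come from the diff below.

```python
# Minimal sketch: with this PR, exporting a subset named e.g. "my/train" in the
# Datumaro format raises PathSeparatorInSubsetNameError instead of silently
# writing annotation files into nested directories.
import os

import numpy as np

from datumaro.components.dataset import Dataset
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.errors import PathSeparatorInSubsetNameError
from datumaro.components.media import Image

dataset = Dataset.from_iterable(
    [
        DatasetItem(
            id="0",
            subset=f"my{os.path.sep}train",  # subset name containing a separator
            media=Image.from_numpy(data=np.ones((4, 4, 3))),
        )
    ]
)

try:
    dataset.export("./export_dir", "datumaro", save_media=True)
except PathSeparatorInSubsetNameError as e:
    # On POSIX: Failed to export the subset 'my/train': subset name contains path separator(s).
    print(e)
```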
2 changes: 2 additions & 0 deletions docs/source/docs/data-formats/formats/datumaro.md
@@ -73,6 +73,8 @@ A Datumaro dataset directory should have the following structure:
└── ...
```

Note that the subset name shouldn't contain path separators.

If your dataset does not follow the above directory structure,
Datumaro cannot properly detect and import it as the Datumaro format.

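For context on why separators are rejected (a sketch, not part of the doc change): the exporter writes one annotation file per subset by joining the annotations directory with `<subset><ext>`, so a separator in the subset name would produce a nested file that subset discovery does not look for on import.

```python
# Rough sketch of the effect, assuming POSIX separators and the default ".json"
# annotation extension of the Datumaro format.
import os.path as osp

annotations_dir = "dataset/annotations"
print(osp.join(annotations_dir, "train" + ".json"))     # dataset/annotations/train.json
print(osp.join(annotations_dir, "my/train" + ".json"))  # dataset/annotations/my/train.json (nested)
```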
2 changes: 2 additions & 0 deletions docs/source/docs/data-formats/formats/datumaro_binary.md
@@ -113,6 +113,8 @@ A DatumaroBinary dataset directory should have the following structure:
└── ...
```

Note that the subset name shouldn't contain path separators.

If your dataset does not follow the above directory structure,
Datumaro cannot properly detect and import it as the DatumaroBinary format.

10 changes: 10 additions & 0 deletions src/datumaro/components/errors.py
@@ -342,6 +342,16 @@
return f"Item {self.item_id} is repeated in the source sequence."


@define(auto_exc=False)
class PathSeparatorInSubsetNameError(DatasetError):
    subset: str = field()

    def __str__(self):
        return (
            f"Failed to export the subset '{self.subset}': subset name contains path separator(s)."
        )

(Codecov / codecov/patch: added line src/datumaro/components/errors.py#L350 was not covered by tests.)


class DatasetQualityError(DatasetError):
pass

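A quick illustration of the new error's message (a sketch, not from the PR's tests):

```python
# Constructing the error directly; the printed message mirrors __str__ above.
from datumaro.components.errors import PathSeparatorInSubsetNameError

err = PathSeparatorInSubsetNameError("my/train")
print(str(err))
# Failed to export the subset 'my/train': subset name contains path separator(s).
```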
35 changes: 26 additions & 9 deletions src/datumaro/plugins/data_formats/datumaro/exporter.py
@@ -38,6 +38,7 @@
from datumaro.components.crypter import NULL_CRYPTER
from datumaro.components.dataset_base import DatasetItem
from datumaro.components.dataset_item_storage import ItemStatus
from datumaro.components.errors import PathSeparatorInSubsetNameError
from datumaro.components.exporter import ExportContextComponent, Exporter
from datumaro.components.media import Image, MediaElement, PointCloud, Video, VideoFrame
from datumaro.util import cast, dump_json_file
@@ -185,7 +186,8 @@ def context_save_media(

if context.save_media:
fname = context.make_video_filename(item)
context.save_video(item, fname=fname, subdir=item.subset)
subdir = item.subset.replace(os.sep, "_") if item.subset else None
context.save_video(item, fname=fname, subdir=subdir)
item.media = Video(
path=fname,
step=video._step,
@@ -200,7 +202,8 @@

if context.save_media:
fname = context.make_video_filename(item)
context.save_video(item, fname=fname, subdir=item.subset)
subdir = item.subset.replace(os.sep, "_") if item.subset else None
context.save_video(item, fname=fname, subdir=subdir)
item.media = VideoFrame(Video(fname), video_frame.index)

yield
@@ -210,8 +213,9 @@

if context.save_media:
# Temporarily update image path and save it.
fname = context.make_image_filename(item)
context.save_image(item, encryption=encryption, fname=fname, subdir=item.subset)
fname = context.make_image_filename(item, name=str(item.id).replace(os.sep, "_"))
subdir = item.subset.replace(os.sep, "_") if item.subset else None
context.save_image(item, encryption=encryption, fname=fname, subdir=subdir)
item.media = Image.from_file(path=fname, size=image._size)

yield
@@ -220,14 +224,18 @@
pcd = item.media_as(PointCloud)

if context.save_media:
pcd_fname = context.make_pcd_filename(item)
context.save_point_cloud(item, fname=pcd_fname, subdir=item.subset)
pcd_name = str(item.id).replace(os.sep, "_")
pcd_fname = context.make_pcd_filename(item, name=pcd_name)
subdir = item.subset.replace(os.sep, "_") if item.subset else None
context.save_point_cloud(item, fname=pcd_fname, subdir=subdir)

extra_images = []
for i, extra_image in enumerate(pcd.extra_images):
extra_images.append(
Image.from_file(
path=context.make_pcd_extra_image_filename(item, i, extra_image)
path=context.make_pcd_extra_image_filename(
item, i, extra_image, name=f"{pcd_name}/extra_image_{i}"
)
)
)
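The media-saving branches above all apply the same sanitization; a small standalone sketch of that logic (the helper name is illustrative, not from the PR):

```python
# Illustrative helper: media for a subset such as "my/train" are written under a
# flattened directory "my_train"; an empty or missing subset is passed through as
# None so the exporter's default handling applies.
import os
from typing import Optional


def media_subdir(subset: Optional[str]) -> Optional[str]:
    return subset.replace(os.sep, "_") if subset else None


assert media_subdir(f"my{os.sep}train") == "my_train"
assert media_subdir("train") == "train"
assert media_subdir(None) is None
assert media_subdir("") is None
```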

@@ -507,18 +515,27 @@ def create_writer(
default_image_ext=self._default_image_ext,
)

if os.path.sep in subset:
raise PathSeparatorInSubsetNameError(subset)

return (
_SubsetWriter(
context=self,
subset=subset,
ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
ann_file=osp.join(
self._annotations_dir,
subset + self.PATH_CLS.ANNOTATION_EXT,
),
export_context=export_context,
)
if not self._stream
else _StreamSubsetWriter(
context=self,
subset=subset,
ann_file=osp.join(self._annotations_dir, subset + self.PATH_CLS.ANNOTATION_EXT),
ann_file=osp.join(
self._annotations_dir,
subset + self.PATH_CLS.ANNOTATION_EXT,
),
export_context=export_context,
)
)
@@ -1,4 +1,4 @@
# Copyright (C) 2023 Intel Corporation
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

@@ -15,7 +15,7 @@

from datumaro.components.crypter import NULL_CRYPTER, Crypter
from datumaro.components.dataset_base import DatasetItem, IDataset
from datumaro.components.errors import DatumaroError
from datumaro.components.errors import DatumaroError, PathSeparatorInSubsetNameError
from datumaro.components.exporter import ExportContext, ExportContextComponent, Exporter
from datumaro.plugins.data_formats.datumaro.exporter import DatumaroExporter
from datumaro.plugins.data_formats.datumaro.exporter import _SubsetWriter as __SubsetWriter
@@ -309,6 +309,9 @@ def create_writer(
default_image_ext=self._default_image_ext,
)

if osp.sep in subset:
raise PathSeparatorInSubsetNameError(subset)

return _SubsetWriter(
context=self,
subset=subset,
185 changes: 185 additions & 0 deletions tests/unit/data_formats/datumaro/conftest.py
@@ -221,6 +221,191 @@ def fxt_test_datumaro_format_dataset():
)


@pytest.fixture
def fxt_test_datumaro_format_dataset_with_path_separator():
label_categories = LabelCategories(attributes={"a", "b", "score"})
for i in range(5):
label_categories.add("cat" + str(i), attributes={"x", "y"})

mask_categories = MaskCategories(generate_colormap(len(label_categories.items)))

points_categories = PointsCategories()
for index, _ in enumerate(label_categories.items):
points_categories.add(index, ["cat1", "cat2"], joints=[[0, 1]])

sep = os.path.sep
return Dataset.from_iterable(
[
DatasetItem(
id="100/0",
subset=f"my{sep}train",
media=Image.from_numpy(data=np.ones((10, 6, 3))),
annotations=[
Caption("hello", id=1),
Caption("world", id=2, group=5),
Label(
2,
id=3,
attributes={
"x": 1,
"y": "2",
},
),
Bbox(
1,
2,
3,
4,
label=4,
id=4,
z_order=1,
attributes={
"score": 1.0,
},
),
Bbox(
5,
6,
7,
8,
id=5,
group=5,
attributes={
"a": 1.5,
"b": "text",
},
),
Points(
[1, 2, 2, 0, 1, 1],
label=0,
id=5,
z_order=4,
attributes={
"x": 1,
"y": "2",
},
),
Mask(
label=3,
id=5,
z_order=2,
image=np.ones((2, 3)),
attributes={
"x": 1,
"y": "2",
},
),
Ellipse(
5,
6,
7,
8,
label=3,
id=5,
z_order=2,
attributes={
"x": 1,
"y": "2",
},
),
Cuboid2D(
[
(1, 1),
(3, 1),
(3, 3),
(1, 3),
(1.5, 1.5),
(3.5, 1.5),
(3.5, 3.5),
(1.5, 3.5),
],
label=3,
id=5,
z_order=2,
attributes={
"x": 1,
"y": "2",
},
),
],
),
DatasetItem(
id=21,
media=Image.from_numpy(data=np.ones((10, 6, 3))),
subset="train",
annotations=[
Caption("test"),
Label(2),
Bbox(1, 2, 3, 4, label=5, id=42, group=42),
],
),
DatasetItem(
id=2,
media=Image.from_numpy(data=np.ones((10, 6, 3))),
subset=f"my{sep}val",
annotations=[
PolyLine([1, 2, 3, 4, 5, 6, 7, 8], id=11, z_order=1),
Polygon([1, 2, 3, 4, 5, 6, 7, 8], id=12, z_order=4),
],
),
DatasetItem(
id="1/1",
media=Image.from_numpy(data=np.ones((10, 6, 3))),
subset="test",
annotations=[
Cuboid3d(
[1.0, 2.0, 3.0],
[2.0, 2.0, 4.0],
[1.0, 3.0, 4.0],
id=6,
label=0,
attributes={"occluded": True},
group=6,
)
],
),
DatasetItem(
id=42,
media=Image.from_numpy(data=np.ones((10, 6, 3))),
subset=f"my{sep}test",
attributes={"a1": 5, "a2": "42"},
),
DatasetItem(
id=42,
media=Image.from_numpy(data=np.ones((10, 6, 3))),
# id and group integer value can be higher than 32bits limits (COCO instances).
annotations=[
Mask(
id=900100087038, group=900100087038, image=np.ones((2, 3), dtype=np.uint8)
),
RleMask(
rle=mask_tools.encode(np.ones((2, 3), dtype=np.uint8, order="F")),
id=900100087038,
group=900100087038,
),
],
),
DatasetItem(
id="1/b/c",
media=Image.from_file(path="1/b/c.qq", size=(2, 4)),
),
],
categories={
AnnotationType.label: label_categories,
AnnotationType.mask: mask_categories,
AnnotationType.points: points_categories,
},
infos={
"string": "test",
"int": 0,
"float": 0.0,
"string_list": ["test0", "test1", "test2"],
"int_list": [0, 1, 2],
"float_list": [0.0, 0.1, 0.2],
},
)


@pytest.fixture
def fxt_test_datumaro_format_video_dataset(test_dir) -> Dataset:
video_path = osp.join(test_dir, "video.avi")
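The tests that exercise the `fxt_test_datumaro_format_dataset_with_path_separator` fixture above are not shown in this excerpt; a hypothetical sketch of how it could be used (the test name and the `test_dir` fixture usage are assumptions):

```python
# Hypothetical test sketch: exporting a dataset whose subsets contain path
# separators should raise the new error for the Datumaro format.
import pytest

from datumaro.components.errors import PathSeparatorInSubsetNameError


def test_cannot_export_subset_with_path_separator(
    fxt_test_datumaro_format_dataset_with_path_separator, test_dir
):
    dataset = fxt_test_datumaro_format_dataset_with_path_separator
    with pytest.raises(PathSeparatorInSubsetNameError):
        dataset.export(test_dir, "datumaro", save_media=True)
```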