Enabled support for 'Video' media type in the datumaro format (#1491)

### Summary

Enabled support for the 'Video' media type in the datumaro format so that annotations can be attached to a whole video or to a video range. Note that a video range is the closed interval [start_frame, end_frame], i.e. both the start frame and the end frame are included.

### How to test

### Checklist

- [x] I have added unit tests to cover my changes.
- [x] I have added integration tests to cover my changes.
- [x] I have added the description of my changes into [CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [x] I have updated the [documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs) accordingly

### License

- [x] I submit _my code changes_ under the same [MIT License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE) that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example below).

```python
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
```
openvinotoolkit · May 10, 2024 · 4394c6a · 4394c6a
1 parent b5e2ad8
commit 4394c6a
Show file tree

Hide file tree

Showing 21 changed files with 660 additions and 191 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Q2 2024 Release 1.7.0
 ### New features
+- Support 'Video' media type in datumaro format
+  (<https://github.com/openvinotoolkit/datumaro/pull/1491>)
 - Add ann_types property for dataset
   (<https://github.com/openvinotoolkit/datumaro/pull/1422>, <https://github.com/openvinotoolkit/datumaro/pull/1479>)
 - Add AnnotationType.rotated_bbox for oriented object detection

diff --git a/docs/source/docs/command-reference/context/util.md b/docs/source/docs/command-reference/context/util.md
@@ -17,6 +17,7 @@ the dataset reproducible and stable.
 This command provides different options like setting the frame step
 (the `-s/--step` option), file name pattern (`-n/--name-pattern`),
 starting (`-b/--start-frame`) and finishing (`-e/--end-frame`) frame etc.
+Note that starting and finishing frames denote a closed interval [`start-frame`, `end-frame`].
 
 Note that this command is equivalent to the following commands:
 ```bash

diff --git a/docs/source/docs/data-formats/formats/datumaro.md b/docs/source/docs/data-formats/formats/datumaro.md
@@ -11,6 +11,7 @@ Supported media types:
 
 - `Image`
 - `PointCloud`
+- `Video`
 - `VideoFrame`
 
 Supported annotation types:

diff --git a/docs/source/docs/data-formats/formats/datumaro_binary.md b/docs/source/docs/data-formats/formats/datumaro_binary.md
@@ -59,6 +59,7 @@ Supported media types:
 
 - `Image`
 - `PointCloud`
+- `Video`
 - `VideoFrame`
 
 Supported annotation types:

diff --git a/docs/source/docs/data-formats/formats/video.md b/docs/source/docs/data-formats/formats/video.md
@@ -31,6 +31,7 @@ dataset = dm.Dataset.import_from('<path_to_video>', format='video_frames')
 
 Datumaro has few import options for `video_frames` format, to apply them
 use the `--` after the main command argument.
+Note that the frame range is the closed interval [`start-frame`, `end-frame`]: both boundary frames are included.
 
 `video_frames` import options:
 - `--subset` (string) - The name of the subset for the produced

diff --git a/src/datumaro/components/dataset.py b/src/datumaro/components/dataset.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020-2023 Intel Corporation
+# Copyright (C) 2020-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -41,6 +41,7 @@
 from datumaro.components.environment import DEFAULT_ENVIRONMENT, Environment
 from datumaro.components.errors import (
     DatasetImportError,
+    DatumaroError,
     MultipleFormatsMatchError,
     NoMatchingFormatsError,
     StreamedItemError,
@@ -888,6 +889,10 @@ def import_from(
             cause = e.__cause__ if getattr(e, "__cause__", None) is not None else e
             cause.__traceback__ = e.__traceback__
             raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from cause
+        except DatumaroError as e:
+            cause = e.__cause__ if getattr(e, "__cause__", None) is not None else e
+            cause.__traceback__ = e.__traceback__
+            raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from cause
         except Exception as e:
             raise DatasetImportError(f"Failed to import dataset '{format}' at '{path}'.") from e
 

diff --git a/src/datumaro/components/exporter.py b/src/datumaro/components/exporter.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2019-2023 Intel Corporation
+# Copyright (C) 2019-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -22,7 +22,7 @@
     DatumaroError,
     ItemExportError,
 )
-from datumaro.components.media import Image, PointCloud, VideoFrame
+from datumaro.components.media import Image, PointCloud, Video, VideoFrame
 from datumaro.components.progress_reporting import NullProgressReporter, ProgressReporter
 from datumaro.util.meta_file_util import save_hashkey_file, save_meta_file
 from datumaro.util.os_util import rmtree
@@ -339,10 +339,15 @@ def make_pcd_extra_image_filename(self, item, idx, image, *, name=None, subdir=N
         ) + self.find_image_ext(image)
 
     def make_video_filename(self, item, *, name=None):
-        if isinstance(item, DatasetItem) and isinstance(item.media, VideoFrame):
+        STR_WRONG_MEDIA_TYPE = "Video item's media type should be Video or VideoFrame"
+        assert isinstance(item, DatasetItem), STR_WRONG_MEDIA_TYPE
+
+        if isinstance(item.media, VideoFrame):
             video_file_name = osp.basename(item.media.video.path)
+        elif isinstance(item.media, Video):
+            video_file_name = osp.basename(item.media.path)
         else:
-            assert "Video item type should be VideoFrame"
+            assert False, STR_WRONG_MEDIA_TYPE
 
         return video_file_name
 
@@ -403,7 +408,7 @@ def save_video(
         subdir: Optional[str] = None,
         fname: Optional[str] = None,
     ):
-        if not item.media or not isinstance(item.media, VideoFrame):
+        if not item.media or not isinstance(item.media, (Video, VideoFrame)):
             log.warning("Item '%s' has no video", item.id)
             return
         basedir = self._video_dir if basedir is None else basedir
@@ -415,7 +420,10 @@ def save_video(
 
         os.makedirs(osp.dirname(path), exist_ok=True)
 
-        item.media.video.save(path, crypter=NULL_CRYPTER)
+        if isinstance(item.media, VideoFrame):
+            item.media.video.save(path, crypter=NULL_CRYPTER)
+        else:  # Video
+            item.media.save(path, crypter=NULL_CRYPTER)
 
     @property
     def images_dir(self) -> str:

diff --git a/src/datumaro/components/media.py b/src/datumaro/components/media.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2021-2022 Intel Corporation
+# Copyright (C) 2021-2024 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 
@@ -94,7 +94,7 @@ def media(self) -> Optional[Type[MediaElement]]:
 class MediaElement(Generic[AnyData]):
     _type = MediaType.MEDIA_ELEMENT
 
-    def __init__(self, crypter: Crypter = NULL_CRYPTER) -> None:
+    def __init__(self, crypter: Crypter = NULL_CRYPTER, *args, **kwargs) -> None:
         self._crypter = crypter
 
     def as_dict(self) -> Dict[str, Any]:
@@ -488,6 +488,26 @@ def video(self) -> Video:
     def path(self) -> str:
         return self._video.path
 
+    def from_self(self, **kwargs):
+        attrs = deepcopy(self.as_dict())
+        if "path" in kwargs:
+            attrs.update({"video": self.video.from_self(**kwargs)})
+            kwargs.pop("path")
+        attrs.update(kwargs)
+        return self.__class__(**attrs)
+
+    def __getstate__(self):
+        # Return only the picklable parts of the state.
+        state = self.__dict__.copy()
+        del state["_data"]
+        return state
+
+    def __setstate__(self, state):
+        # Restore the objects' state.
+        self.__dict__.update(state)
+        # Reinitialize unpicklable attributes
+        self._data = lambda: self._video.get_frame_data(self._index)
+
 
 class _VideoFrameIterator(Iterator[VideoFrame]):
     """
@@ -527,6 +547,11 @@ def _decode(self, cap) -> Iterator[VideoFrame]:
 
         if self._video._frame_count is None:
             self._video._frame_count = self._pos + 1
+            if self._video._end_frame and self._video._end_frame >= self._video._frame_count:
+                raise ValueError(
+                    f"The end_frame value({self._video._end_frame}) of the video "
+                    f"must be less than the frame count({self._video._frame_count})."
+                )
 
     def _make_frame(self, index) -> VideoFrame:
         return VideoFrame(self._video, index=index)
@@ -575,13 +600,22 @@ class Video(MediaElement, Iterable[VideoFrame]):
     """
 
     def __init__(
-        self, path: str, *, step: int = 1, start_frame: int = 0, end_frame: Optional[int] = None
+        self,
+        path: str,
+        step: int = 1,
+        start_frame: int = 0,
+        end_frame: Optional[int] = None,
+        *args,
+        **kwargs,
     ) -> None:
-        super().__init__()
+        super().__init__(*args, **kwargs)
         self._path = path
 
+        assert 0 <= start_frame
         if end_frame:
-            assert start_frame < end_frame
+            assert start_frame <= end_frame
+            # we can't know the video length here,
+            # so we cannot validate if the end_frame is valid.
         assert 0 < step
         self._step = step
         self._start_frame = start_frame
@@ -630,7 +664,7 @@ def __iter__(self) -> Iterator[VideoFrame]:
             # Decoding is not necessary to get frame pointers
             # However, it can be inaccurate
             end_frame = self._get_end_frame()
-            for index in range(self._start_frame, end_frame, self._step):
+            for index in range(self._start_frame, end_frame + 1, self._step):
                 yield VideoFrame(video=self, index=index)
         else:
             # Need to decode to iterate over frames
@@ -639,7 +673,8 @@ def __iter__(self) -> Iterator[VideoFrame]:
     @property
     def length(self) -> Optional[int]:
         """
-        Returns frame count, if video provides such information.
+        Returns frame count of the closed interval [start_frame, end_frame],
+        if video provides such information.
 
         Note that not all videos provide length / duration metainfo, so the
         result may be undefined.
@@ -655,12 +690,15 @@ def length(self) -> Optional[int]:
         if self._length is None:
             end_frame = self._get_end_frame()
 
-            length = None
             if end_frame is not None:
-                length = (end_frame - self._start_frame) // self._step
-                assert 0 < length
-
-            self._length = length
+                length = (end_frame + 1 - self._start_frame) // self._step
+                if 0 >= length:
+                    raise ValueError(
+                        "There is no valid frame for the closed interval"
+                        f"[start_frame({self._start_frame}),"
+                        f" end_frame({end_frame})] with step({self._step})."
+                    )
+                self._length = length
 
         return self._length
 
@@ -686,18 +724,23 @@ def _get_frame_size(self) -> Tuple[int, int]:
         return frame_size
 
     def _get_end_frame(self):
+        # Note that end_frame could be less than the last frame of the video
         if self._end_frame is not None and self._frame_count is not None:
             end_frame = min(self._end_frame, self._frame_count)
+        elif self._end_frame is not None:
+            end_frame = self._end_frame
+        elif self._frame_count is not None:
+            end_frame = self._frame_count - 1
         else:
-            end_frame = self._end_frame or self._frame_count
+            end_frame = None
 
         return end_frame
 
     def _includes_frame(self, i):
-        end_frame = self._get_end_frame()
         if self._start_frame <= i:
             if (i - self._start_frame) % self._step == 0:
-                if end_frame is None or i < end_frame:
+                end_frame = self._get_end_frame()
+                if end_frame is None or i <= end_frame:
                     return True
 
         return False
@@ -719,15 +762,49 @@ def _reset_reader(self):
         assert self._reader.isOpened()
 
     def __eq__(self, other: object) -> bool:
+        def _get_frame(obj: Video, idx: int):
+            try:
+                return obj[idx]
+            except IndexError:
+                return None
+
         if not isinstance(other, __class__):
             return False
+        if self._start_frame != other._start_frame or self._step != other._step:
+            return False
 
-        return (
-            self.path == other.path
-            and self._start_frame == other._start_frame
-            and self._step == other._step
-            and self._end_frame == other._end_frame
-        )
+        # The video path can vary if a dataset is copied.
+        # So, we need to check if the video data is the same instead of checking paths.
+        if self._end_frame is not None and self._end_frame == other._end_frame:
+            for idx in range(self._start_frame, self._end_frame + 1, self._step):
+                if self[idx] != other[idx]:
+                    return False
+            return True
+
+        end_frame = self._end_frame or other._end_frame
+        if end_frame is None:
+            last_frame = None
+            for idx, frame in enumerate(self):
+                if frame != _get_frame(other, frame.index):
+                    return False
+                last_frame = frame
+            # check if the actual last frames are same
+            try:
+                other[last_frame.index + self._step if last_frame else self._start_frame]
+            except IndexError:
+                return True
+            return False
+
+        # _end_frame values, only one of the two is valid
+        for idx in range(self._start_frame, end_frame + 1, self._step):
+            frame = _get_frame(self, idx)
+            if frame is None:
+                return False
+            if frame != _get_frame(other, idx):
+                return False
+        # check if the actual last frames are same
+        idx_next = end_frame + self._step
+        return None is (_get_frame(self, idx_next) or _get_frame(other, idx_next))
 
     def __hash__(self):
         # Required for caching
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ Supported media types: @@
     - `Image`
     - `PointCloud`
+    - `Video`
     - `VideoFrame`
     Supported annotation types:
@@ Expand Down @@