diff --git a/CHANGELOG.md b/CHANGELOG.md index 89d923d4fb..0e26aac1df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 () - Fix a bug in the previous behavior when importing nested datasets in the project () +- Fix Kaggle importer when adding duplicated labels + () ## 16/11/2023 - Release 1.5.1 ### Enhancements diff --git a/src/datumaro/plugins/data_formats/kaggle/base.py b/src/datumaro/plugins/data_formats/kaggle/base.py index 71d3335c34..1510a49318 100644 --- a/src/datumaro/plugins/data_formats/kaggle/base.py +++ b/src/datumaro/plugins/data_formats/kaggle/base.py @@ -307,13 +307,13 @@ def __init__( for img_filename in os.listdir(path): if not img_filename.lower().endswith(tuple(IMAGE_EXTENSIONS)): continue - item_id = os.path.splitext(img_filename)[0] + item_id = osp.splitext(img_filename)[0] - img_file = os.path.join(path, img_filename) - ann_file = os.path.join(ann_path, item_id + self.ann_extensions) + img_file = osp.join(path, img_filename) + ann_file = osp.join(ann_path, item_id + self.ann_extensions) annotations = ( - self._parse_annotations(img_file, ann_file) if os.path.isfile(ann_file) else [] + self._parse_annotations(img_file, ann_file) if osp.isfile(ann_file) else [] ) media = Image.from_file(path=img_file, size=self._size) @@ -351,8 +351,11 @@ def _parse_annotations(self, img_file: str, ann_file: str): ymin = self._parse_field(bbox_elem, "ymin", float) ymax = self._parse_field(bbox_elem, "ymax", float) - self._label_cat.add(label_name) - label_id, _ = self._label_cat.find(label_name) + label_id, cat = self._label_cat.find(label_name) + if not cat: + self._label_cat.add(label_name) + label_id, _ = self._label_cat.find(label_name) + annotations.append( Bbox(id=obj_id, label=label_id, x=xmin, y=ymin, w=xmax - xmin, h=ymax - ymin) ) @@ -419,7 +422,10 @@ def _parse_annotations(self, img_file: str, ann_file: str): w *= image_width h *= image_height - 
self._label_cat.add(label_name) - label_id, _ = self._label_cat.find(label_name) + # Add the label only when find() reports no existing category; label_id is set on both paths. + label_id, cat = self._label_cat.find(label_name) + if not cat: + self._label_cat.add(label_name) + label_id, _ = self._label_cat.find(label_name) annotations.append(Bbox(id=obj_id, label=label_id, x=x, y=y, w=w, h=h))