[Datumaro] Add dataset statistics (#1668)
* Add statistics command

* Add tests

* Update changelog

* Fix test

* Handle image absence

Co-authored-by: Nikita Manovich <40690625+nmanovic@users.noreply.github.com>
zhiltsov-max and nmanovic authored Aug 7, 2020
1 parent 5a8d719 commit eaeb67d
Showing 4 changed files with 273 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Siammask tracker as DL serverless function (<https://github.com/opencv/cvat/pull/1988>)
 - [Datumaro] Added model info and source info commands (<https://github.com/opencv/cvat/pull/1973>)
+- [Datumaro] Dataset statistics (<https://github.com/opencv/cvat/pull/1668>)
 
 ### Changed
 - Shape coordinates are rounded to 2 digits in dumped annotations (<https://github.com/opencv/cvat/pull/1970>)
27 changes: 11 additions & 16 deletions datumaro/datumaro/cli/contexts/project/__init__.py
@@ -17,7 +17,8 @@
 from datumaro.components.dataset_filter import DatasetItemEncoder
 from datumaro.components.extractor import AnnotationType
 from datumaro.components.cli_plugin import CliPlugin
-from datumaro.components.operations import mean_std
+from datumaro.components.operations import \
+    compute_image_statistics, compute_ann_statistics
 from .diff import DiffVisualizer
 from ...util import add_subparser, CliException, MultilineFormatter, \
     make_file_name
@@ -648,22 +649,16 @@ def build_stats_parser(parser_ctor=argparse.ArgumentParser):
 
 def stats_command(args):
     project = load_project(args.project_dir)
-    dataset = project.make_dataset()
-
-    def print_extractor_info(extractor, indent=''):
-        mean, std = mean_std(dataset)
-        print("%sImage mean:" % indent, ', '.join('%.3f' % n for n in mean))
-        print("%sImage std:" % indent, ', '.join('%.3f' % n for n in std))
-
-    print("Dataset: ")
-    print_extractor_info(dataset)
-
-    if 1 < len(dataset.subsets()):
-        print("Subsets: ")
-        for subset_name in dataset.subsets():
-            subset = dataset.get_subset(subset_name)
-            print("  %s:" % subset_name)
-            print_extractor_info(subset, " " * 4)
+    dataset = project.make_dataset()
+    stats = {}
+    stats.update(compute_image_statistics(dataset))
+    stats.update(compute_ann_statistics(dataset))
+
+    dst_file = generate_next_file_name('statistics', ext='.json')
+    log.info("Writing project statistics to '%s'" % dst_file)
+    with open(dst_file, 'w') as f:
+        json.dump(stats, f, indent=4, sort_keys=True)
 
 def build_info_parser(parser_ctor=argparse.ArgumentParser):
     parser = parser_ctor(help="Get project info",
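The new command thus replaces the ad-hoc console printout with a single JSON report that merges image-level and annotation-level statistics. A rough usage sketch of the two functions outside the CLI (assuming an in-memory dataset built with Dataset.from_iterable, as the tests below do, rather than a project loaded from disk):

import json

import numpy as np

from datumaro.components.extractor import DatasetItem, Label, Bbox
from datumaro.components.project import Dataset
from datumaro.components.operations import \
    compute_image_statistics, compute_ann_statistics

dataset = Dataset.from_iterable([
    DatasetItem(id=1, image=np.ones((8, 8, 3)), annotations=[
        Label(0),
        Bbox(0, 0, 2, 2, label=0),
    ]),
], categories=['cat'])

stats = {}
stats.update(compute_image_statistics(dataset))  # 'images count', mean, std
stats.update(compute_ann_statistics(dataset))    # labels, attributes, segments

print(json.dumps(stats, indent=4, sort_keys=True))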
155 changes: 155 additions & 0 deletions datumaro/datumaro/components/operations.py
@@ -3,9 +3,14 @@
 #
 # SPDX-License-Identifier: MIT
 
+import logging as log
+from copy import deepcopy
+
 import cv2
 import numpy as np
 
+from datumaro.components.extractor import AnnotationType
+
 
 def mean_std(dataset):
     """
@@ -14,6 +19,8 @@ def mean_std(dataset):
     # Use an online algorithm to:
     #   - handle different image sizes
     #   - avoid cancellation problem
+    if len(dataset) == 0:
+        return [0, 0, 0], [0, 0, 0]
 
     stats = np.empty((len(dataset), 2, 3), dtype=np.double)
     counts = np.empty(len(dataset), dtype=np.uint32)
@@ -80,3 +87,151 @@ def compute_stats(stats, counts, mean_accessor, variance_accessor):
            *__class__.compute_stats(stats[:h], counts[:h], m, v),
            *__class__.compute_stats(stats[h:], counts[h:], m, v)
        )

def compute_image_statistics(dataset):
    stats = {
        'dataset': {},
        'subsets': {}
    }

    def _extractor_stats(extractor):
        available = True
        for item in extractor:
            if not (item.has_image and item.image.has_data):
                available = False
                log.warn("Item %s has no image. Image stats won't be computed",
                    item.id)
                break

        stats = {
            'images count': len(extractor),
        }

        if available:
            mean, std = mean_std(extractor)
            stats.update({
                'image mean': [float(n) for n in mean[::-1]],
                'image std': [float(n) for n in std[::-1]],
            })
        else:
            stats.update({
                'image mean': 'n/a',
                'image std': 'n/a',
            })
        return stats

    stats['dataset'].update(_extractor_stats(dataset))

    subsets = dataset.subsets() or [None]
    if subsets and 0 < len([s for s in subsets if s]):
        for subset_name in subsets:
            stats['subsets'][subset_name] = _extractor_stats(
                dataset.get_subset(subset_name))

    return stats

def compute_ann_statistics(dataset):
    labels = dataset.categories().get(AnnotationType.label)
    def get_label(ann):
        return labels.items[ann.label].name if ann.label is not None else None

    stats = {
        'images count': len(dataset),
        'annotations count': 0,
        'unannotated images count': 0,
        'unannotated images': [],
        'annotations by type': { t.name: {
            'count': 0,
        } for t in AnnotationType },
        'annotations': {},
    }
    by_type = stats['annotations by type']

    attr_template = {
        'count': 0,
        'values count': 0,
        'values present': set(),
        'distribution': {}, # value -> (count, total%)
    }
    label_stat = {
        'count': 0,
        'distribution': { l.name: [0, 0] for l in labels.items
        }, # label -> (count, total%)

        'attributes': {},
    }
    stats['annotations']['labels'] = label_stat
    segm_stat = {
        'avg. area': 0,
        'area distribution': [], # a histogram with 10 bins
            # (min, min+10%), ..., (min+90%, max) -> (count, total%)

        'pixel distribution': { l.name: [0, 0] for l in labels.items
        }, # label -> (count, total%)
    }
    stats['annotations']['segments'] = segm_stat
    segm_areas = []
    pixel_dist = segm_stat['pixel distribution']
    total_pixels = 0

    for item in dataset:
        if len(item.annotations) == 0:
            stats['unannotated images'].append(item.id)
            continue

        for ann in item.annotations:
            by_type[ann.type.name]['count'] += 1

            if not hasattr(ann, 'label') or ann.label is None:
                continue

            if ann.type in {AnnotationType.mask,
                    AnnotationType.polygon, AnnotationType.bbox}:
                area = ann.get_area()
                segm_areas.append(area)
                pixel_dist[get_label(ann)][0] += int(area)

            label_stat['count'] += 1
            label_stat['distribution'][get_label(ann)][0] += 1

            for name, value in ann.attributes.items():
                if name.lower() in { 'occluded', 'visibility', 'score',
                        'id', 'track_id' }:
                    continue
                attrs_stat = label_stat['attributes'].setdefault(name,
                    deepcopy(attr_template))
                attrs_stat['count'] += 1
                attrs_stat['values present'].add(str(value))
                attrs_stat['distribution'] \
                    .setdefault(str(value), [0, 0])[0] += 1

    stats['annotations count'] = sum(t['count'] for t in
        stats['annotations by type'].values())
    stats['unannotated images count'] = len(stats['unannotated images'])

    for label_info in label_stat['distribution'].values():
        label_info[1] = label_info[0] / label_stat['count']

    for label_attr in label_stat['attributes'].values():
        label_attr['values count'] = len(label_attr['values present'])
        label_attr['values present'] = sorted(label_attr['values present'])
        for attr_info in label_attr['distribution'].values():
            attr_info[1] = attr_info[0] / label_attr['count']

    # numpy.sum might be faster, but could overflow with large datasets.
    # Python's int can transparently mutate to be of indefinite precision (long)
    total_pixels = sum(int(a) for a in segm_areas)

    segm_stat['avg. area'] = total_pixels / (len(segm_areas) or 1.0)

    for label_info in segm_stat['pixel distribution'].values():
        label_info[1] = label_info[0] / total_pixels

    if len(segm_areas) != 0:
        hist, bins = np.histogram(segm_areas)
        segm_stat['area distribution'] = [{
            'min': float(bin_min), 'max': float(bin_max),
            'count': int(c), 'percent': int(c) / len(segm_areas)
        } for c, (bin_min, bin_max) in zip(hist, zip(bins[:-1], bins[1:]))]

    return stats
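A note on the "online algorithm" mentioned in mean_std above: the idea is the standard pairwise merge of per-group means and variances, which copes with images of different pixel counts and avoids the catastrophic cancellation of the naive sum-of-squares formula. A minimal illustrative sketch of the merge step (not the exact code above; merge_mean_var is a made-up helper name):

import numpy as np

def merge_mean_var(n_a, mean_a, var_a, n_b, mean_b, var_b):
    # Chan et al.'s parallel update: combine two groups' statistics
    # without revisiting the underlying pixels
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # m2 is the sum of squared deviations from the combined mean
    m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
    return n, mean, m2 / n

# Per-channel stats of two "images" with different pixel counts
a = np.random.rand(10, 10, 3).reshape(-1, 3)
b = np.random.rand(20, 5, 3).reshape(-1, 3)
n, mean, var = merge_mean_var(
    len(a), a.mean(axis=0), a.var(axis=0),
    len(b), b.mean(axis=0), b.var(axis=0))

both = np.vstack([a, b])
assert np.allclose(mean, both.mean(axis=0))
assert np.allclose(var, both.var(axis=0))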
109 changes: 106 additions & 3 deletions datumaro/tests/test_ops.py
@@ -1,7 +1,9 @@
 import numpy as np
 
-from datumaro.components.extractor import Extractor, DatasetItem
-from datumaro.components.operations import mean_std
+from datumaro.components.extractor import (Extractor, DatasetItem, Label,
+    Mask, Bbox, Points, Caption)
+from datumaro.components.project import Dataset
+from datumaro.components.operations import mean_std, compute_ann_statistics
 
 from unittest import TestCase
 
@@ -28,4 +30,105 @@ def __iter__(self):
        for em, am in zip(expected_mean, actual_mean):
            self.assertAlmostEqual(em, am, places=0)
        for estd, astd in zip(expected_std, actual_std):
            self.assertAlmostEqual(estd, astd, places=0)

    def test_stats(self):
        dataset = Dataset.from_iterable([
            DatasetItem(id=1, image=np.ones((5, 5, 3)), annotations=[
                Caption('hello'),
                Caption('world'),
                Label(2, attributes={ 'x': 1, 'y': '2', }),
                Bbox(1, 2, 2, 2, label=2, attributes={ 'score': 0.5, }),
                Bbox(5, 6, 2, 2, attributes={
                    'x': 1, 'y': '3', 'occluded': True,
                }),
                Points([1, 2, 2, 0, 1, 1], label=0),
                Mask(label=3, image=np.array([
                    [0, 0, 1, 1, 1],
                    [0, 0, 1, 1, 1],
                    [0, 0, 1, 1, 1],
                    [0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0],
                ])),
            ]),
            DatasetItem(id=2, image=np.ones((2, 4, 3)), annotations=[
                Label(2, attributes={ 'x': 2, 'y': '2', }),
                Bbox(1, 2, 2, 2, label=3, attributes={ 'score': 0.5, }),
                Bbox(5, 6, 2, 2, attributes={
                    'x': 2, 'y': '3', 'occluded': False,
                }),
            ]),
            DatasetItem(id=3),
        ], categories=['label_%s' % i for i in range(4)])

        expected = {
            'images count': 3,
            'annotations count': 10,
            'unannotated images count': 1,
            'unannotated images': ['3'],
            'annotations by type': {
                'label': { 'count': 2, },
                'polygon': { 'count': 0, },
                'polyline': { 'count': 0, },
                'bbox': { 'count': 4, },
                'mask': { 'count': 1, },
                'points': { 'count': 1, },
                'caption': { 'count': 2, },
            },
            'annotations': {
                'labels': {
                    'count': 6,
                    'distribution': {
                        'label_0': [1, 1/6],
                        'label_1': [0, 0.0],
                        'label_2': [3, 3/6],
                        'label_3': [2, 2/6],
                    },
                    'attributes': {
                        'x': {
                            'count': 2, # annotations with no label are skipped
                            'values count': 2,
                            'values present': ['1', '2'],
                            'distribution': {
                                '1': [1, 1/2],
                                '2': [1, 1/2],
                            },
                        },
                        'y': {
                            'count': 2, # annotations with no label are skipped
                            'values count': 1,
                            'values present': ['2'],
                            'distribution': {
                                '2': [2, 2/2],
                            },
                        },
                        # must not include "special" attributes like "occluded"
                    }
                },
                'segments': {
                    'avg. area': (4 * 2 + 9 * 1) / 3,
                    'area distribution': [
                        {'min': 4.0, 'max': 4.5, 'count': 2, 'percent': 2/3},
                        {'min': 4.5, 'max': 5.0, 'count': 0, 'percent': 0.0},
                        {'min': 5.0, 'max': 5.5, 'count': 0, 'percent': 0.0},
                        {'min': 5.5, 'max': 6.0, 'count': 0, 'percent': 0.0},
                        {'min': 6.0, 'max': 6.5, 'count': 0, 'percent': 0.0},
                        {'min': 6.5, 'max': 7.0, 'count': 0, 'percent': 0.0},
                        {'min': 7.0, 'max': 7.5, 'count': 0, 'percent': 0.0},
                        {'min': 7.5, 'max': 8.0, 'count': 0, 'percent': 0.0},
                        {'min': 8.0, 'max': 8.5, 'count': 0, 'percent': 0.0},
                        {'min': 8.5, 'max': 9.0, 'count': 1, 'percent': 1/3},
                    ],
                    'pixel distribution': {
                        'label_0': [0, 0.0],
                        'label_1': [0, 0.0],
                        'label_2': [4, 4/17],
                        'label_3': [13, 13/17],
                    },
                }
            },
        }

        actual = compute_ann_statistics(dataset)

        self.assertEqual(expected, actual)

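A quick cross-check of the expected numbers above (plain Python, not part of the commit): the labeled segments are two 2x2 boxes (area 4 each, labels 2 and 3) and the mask's 9 foreground pixels (label 3), and numpy's default 10-bin histogram over [4, 4, 9] spans 4.0 to 9.0 in steps of 0.5:

import numpy as np

areas = [2 * 2, 2 * 2, 9]          # bbox(label_2), bbox(label_3), mask(label_3)
total = sum(areas)                 # 17 pixels over all labeled segments

assert sum(areas) / len(areas) == (4 * 2 + 9 * 1) / 3   # 'avg. area'
assert 4 / total == 4 / 17         # 'pixel distribution' share of label_2
assert (9 + 4) / total == 13 / 17  # 'pixel distribution' share of label_3

hist, bins = np.histogram(areas)   # 10 equal-width bins by default
assert list(hist) == [2, 0, 0, 0, 0, 0, 0, 0, 0, 1]
assert bins[0] == 4.0 and bins[-1] == 9.0   # bin edges 4.0, 4.5, ..., 9.0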