[Datumaro] Add dataset statistics (#1668)
* Add statistics command

* Add tests

* Update changelog

* Fix test

* Handle image absence

Co-authored-by: Nikita Manovich <40690625+nmanovic@users.noreply.github.com>
zhiltsov-max and nmanovic authored Aug 7, 2020
1 parent 5a8d719 commit eaeb67d
Showing 4 changed files with 273 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Siammask tracker as DL serverless function (<https://github.com/opencv/cvat/pull/1988>)
 - [Datumaro] Added model info and source info commands (<https://github.com/opencv/cvat/pull/1973>)
+- [Datumaro] Dataset statistics (<https://github.com/opencv/cvat/pull/1668>)
 
 ### Changed
 - Shape coordinates are rounded to 2 digits in dumped annotations (<https://github.com/opencv/cvat/pull/1970>)
27 changes: 11 additions & 16 deletions datumaro/datumaro/cli/contexts/project/__init__.py
@@ -17,7 +17,8 @@
 from datumaro.components.dataset_filter import DatasetItemEncoder
 from datumaro.components.extractor import AnnotationType
 from datumaro.components.cli_plugin import CliPlugin
-from datumaro.components.operations import mean_std
+from datumaro.components.operations import \
+    compute_image_statistics, compute_ann_statistics
 from .diff import DiffVisualizer
 from ...util import add_subparser, CliException, MultilineFormatter, \
     make_file_name
@@ -648,22 +649,16 @@ def build_stats_parser(parser_ctor=argparse.ArgumentParser):
 
 def stats_command(args):
     project = load_project(args.project_dir)
-    dataset = project.make_dataset()
-
-    def print_extractor_info(extractor, indent=''):
-        mean, std = mean_std(dataset)
-        print("%sImage mean:" % indent, ', '.join('%.3f' % n for n in mean))
-        print("%sImage std:" % indent, ', '.join('%.3f' % n for n in std))
-
-    print("Dataset: ")
-    print_extractor_info(dataset)
-
-    if 1 < len(dataset.subsets()):
-        print("Subsets: ")
-        for subset_name in dataset.subsets():
-            subset = dataset.get_subset(subset_name)
-            print("  %s:" % subset_name)
-            print_extractor_info(subset, " " * 4)
+    dataset = project.make_dataset()
+    stats = {}
+    stats.update(compute_image_statistics(dataset))
+    stats.update(compute_ann_statistics(dataset))
+
+    dst_file = generate_next_file_name('statistics', ext='.json')
+    log.info("Writing project statistics to '%s'" % dst_file)
+    with open(dst_file, 'w') as f:
+        json.dump(stats, f, indent=4, sort_keys=True)
 
 def build_info_parser(parser_ctor=argparse.ArgumentParser):
     parser = parser_ctor(help="Get project info",
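The new command thus replaces the ad-hoc console printout with a single JSON report that merges image-level and annotation-level statistics. A rough usage sketch of the two functions outside the CLI (assuming an in-memory dataset built with Dataset.from_iterable, as the tests below do, rather than a project loaded from disk):

import json

import numpy as np

from datumaro.components.extractor import DatasetItem, Label, Bbox
from datumaro.components.project import Dataset
from datumaro.components.operations import \
    compute_image_statistics, compute_ann_statistics

dataset = Dataset.from_iterable([
    DatasetItem(id=1, image=np.ones((8, 8, 3)), annotations=[
        Label(0),
        Bbox(0, 0, 2, 2, label=0),
    ]),
], categories=['cat'])

stats = {}
stats.update(compute_image_statistics(dataset))  # 'images count', mean, std
stats.update(compute_ann_statistics(dataset))    # labels, attributes, segments

print(json.dumps(stats, indent=4, sort_keys=True))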
155 changes: 155 additions & 0 deletions datumaro/datumaro/components/operations.py
@@ -3,9 +3,14 @@
 #
 # SPDX-License-Identifier: MIT
 
+import logging as log
+from copy import deepcopy
+
 import cv2
 import numpy as np
 
+from datumaro.components.extractor import AnnotationType
+
 
 def mean_std(dataset):
     """
@@ -14,6 +19,8 @@ def mean_std(dataset):
     # Use an online algorithm to:
     #   - handle different image sizes
     #   - avoid cancellation problem
+    if len(dataset) == 0:
+        return [0, 0, 0], [0, 0, 0]
 
     stats = np.empty((len(dataset), 2, 3), dtype=np.double)
     counts = np.empty(len(dataset), dtype=np.uint32)
@@ -80,3 +87,151 @@ def compute_stats(stats, counts, mean_accessor, variance_accessor):
            *__class__.compute_stats(stats[:h], counts[:h], m, v),
            *__class__.compute_stats(stats[h:], counts[h:], m, v)
        )

def compute_image_statistics(dataset):
    stats = {
        'dataset': {},
        'subsets': {}
    }

    def _extractor_stats(extractor):
        available = True
        for item in extractor:
            if not (item.has_image and item.image.has_data):
                available = False
                log.warn("Item %s has no image. Image stats won't be computed",
                    item.id)
                break

        stats = {
            'images count': len(extractor),
        }

        if available:
            mean, std = mean_std(extractor)
            stats.update({
                'image mean': [float(n) for n in mean[::-1]],
                'image std': [float(n) for n in std[::-1]],
            })
        else:
            stats.update({
                'image mean': 'n/a',
                'image std': 'n/a',
            })
        return stats

    stats['dataset'].update(_extractor_stats(dataset))

    subsets = dataset.subsets() or [None]
    if subsets and 0 < len([s for s in subsets if s]):
        for subset_name in subsets:
            stats['subsets'][subset_name] = _extractor_stats(
                dataset.get_subset(subset_name))

    return stats

def compute_ann_statistics(dataset):
    labels = dataset.categories().get(AnnotationType.label)
    def get_label(ann):
        return labels.items[ann.label].name if ann.label is not None else None

    stats = {
        'images count': len(dataset),
        'annotations count': 0,
        'unannotated images count': 0,
        'unannotated images': [],
        'annotations by type': { t.name: {
            'count': 0,
        } for t in AnnotationType },
        'annotations': {},
    }
    by_type = stats['annotations by type']

    attr_template = {
        'count': 0,
        'values count': 0,
        'values present': set(),
        'distribution': {}, # value -> (count, total%)
    }
    label_stat = {
        'count': 0,
        'distribution': { l.name: [0, 0] for l in labels.items
        }, # label -> (count, total%)

        'attributes': {},
    }
    stats['annotations']['labels'] = label_stat
    segm_stat = {
        'avg. area': 0,
        'area distribution': [], # a histogram with 10 bins
            # (min, min+10%), ..., (min+90%, max) -> (count, total%)

        'pixel distribution': { l.name: [0, 0] for l in labels.items
        }, # label -> (count, total%)
    }
    stats['annotations']['segments'] = segm_stat
    segm_areas = []
    pixel_dist = segm_stat['pixel distribution']
    total_pixels = 0

    for item in dataset:
        if len(item.annotations) == 0:
            stats['unannotated images'].append(item.id)
            continue

        for ann in item.annotations:
            by_type[ann.type.name]['count'] += 1

            if not hasattr(ann, 'label') or ann.label is None:
                continue

            if ann.type in {AnnotationType.mask,
                    AnnotationType.polygon, AnnotationType.bbox}:
                area = ann.get_area()
                segm_areas.append(area)
                pixel_dist[get_label(ann)][0] += int(area)

            label_stat['count'] += 1
            label_stat['distribution'][get_label(ann)][0] += 1

            for name, value in ann.attributes.items():
                if name.lower() in { 'occluded', 'visibility', 'score',
                        'id', 'track_id' }:
                    continue
                attrs_stat = label_stat['attributes'].setdefault(name,
                    deepcopy(attr_template))
                attrs_stat['count'] += 1
                attrs_stat['values present'].add(str(value))
                attrs_stat['distribution'] \
                    .setdefault(str(value), [0, 0])[0] += 1

    stats['annotations count'] = sum(t['count'] for t in
        stats['annotations by type'].values())
    stats['unannotated images count'] = len(stats['unannotated images'])

    for label_info in label_stat['distribution'].values():
        label_info[1] = label_info[0] / label_stat['count']

    for label_attr in label_stat['attributes'].values():
        label_attr['values count'] = len(label_attr['values present'])
        label_attr['values present'] = sorted(label_attr['values present'])
        for attr_info in label_attr['distribution'].values():
            attr_info[1] = attr_info[0] / label_attr['count']

    # numpy.sum might be faster, but could overflow with large datasets.
    # Python's int can transparently mutate to be of indefinite precision (long)
    total_pixels = sum(int(a) for a in segm_areas)

    segm_stat['avg. area'] = total_pixels / (len(segm_areas) or 1.0)

    for label_info in segm_stat['pixel distribution'].values():
        label_info[1] = label_info[0] / total_pixels

    if len(segm_areas) != 0:
        hist, bins = np.histogram(segm_areas)
        segm_stat['area distribution'] = [{
            'min': float(bin_min), 'max': float(bin_max),
            'count': int(c), 'percent': int(c) / len(segm_areas)
        } for c, (bin_min, bin_max) in zip(hist, zip(bins[:-1], bins[1:]))]

    return stats
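A note on the "online algorithm" mentioned in mean_std above: the idea is the standard pairwise merge of per-group means and variances, which copes with images of different pixel counts and avoids the catastrophic cancellation of the naive sum-of-squares formula. A minimal illustrative sketch of the merge step (not the exact code above; merge_mean_var is a made-up helper name):

import numpy as np

def merge_mean_var(n_a, mean_a, var_a, n_b, mean_b, var_b):
    # Chan et al.'s parallel update: combine two groups' statistics
    # without revisiting the underlying pixels
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # m2 is the sum of squared deviations from the combined mean
    m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
    return n, mean, m2 / n

# Per-channel stats of two "images" with different pixel counts
a = np.random.rand(10, 10, 3).reshape(-1, 3)
b = np.random.rand(20, 5, 3).reshape(-1, 3)
n, mean, var = merge_mean_var(
    len(a), a.mean(axis=0), a.var(axis=0),
    len(b), b.mean(axis=0), b.var(axis=0))

both = np.vstack([a, b])
assert np.allclose(mean, both.mean(axis=0))
assert np.allclose(var, both.var(axis=0))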
109 changes: 106 additions & 3 deletions datumaro/tests/test_ops.py
@@ -1,7 +1,9 @@
 import numpy as np
 
-from datumaro.components.extractor import Extractor, DatasetItem
-from datumaro.components.operations import mean_std
+from datumaro.components.extractor import (Extractor, DatasetItem, Label,
+    Mask, Bbox, Points, Caption)
+from datumaro.components.project import Dataset
+from datumaro.components.operations import mean_std, compute_ann_statistics
 
 from unittest import TestCase
 
@@ -28,4 +30,105 @@ def __iter__(self):
        for em, am in zip(expected_mean, actual_mean):
            self.assertAlmostEqual(em, am, places=0)
        for estd, astd in zip(expected_std, actual_std):
            self.assertAlmostEqual(estd, astd, places=0)

    def test_stats(self):
        dataset = Dataset.from_iterable([
            DatasetItem(id=1, image=np.ones((5, 5, 3)), annotations=[
                Caption('hello'),
                Caption('world'),
                Label(2, attributes={ 'x': 1, 'y': '2', }),
                Bbox(1, 2, 2, 2, label=2, attributes={ 'score': 0.5, }),
                Bbox(5, 6, 2, 2, attributes={
                    'x': 1, 'y': '3', 'occluded': True,
                }),
                Points([1, 2, 2, 0, 1, 1], label=0),
                Mask(label=3, image=np.array([
                    [0, 0, 1, 1, 1],
                    [0, 0, 1, 1, 1],
                    [0, 0, 1, 1, 1],
                    [0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0],
                ])),
            ]),
            DatasetItem(id=2, image=np.ones((2, 4, 3)), annotations=[
                Label(2, attributes={ 'x': 2, 'y': '2', }),
                Bbox(1, 2, 2, 2, label=3, attributes={ 'score': 0.5, }),
                Bbox(5, 6, 2, 2, attributes={
                    'x': 2, 'y': '3', 'occluded': False,
                }),
            ]),
            DatasetItem(id=3),
        ], categories=['label_%s' % i for i in range(4)])

        expected = {
            'images count': 3,
            'annotations count': 10,
            'unannotated images count': 1,
            'unannotated images': ['3'],
            'annotations by type': {
                'label': { 'count': 2, },
                'polygon': { 'count': 0, },
                'polyline': { 'count': 0, },
                'bbox': { 'count': 4, },
                'mask': { 'count': 1, },
                'points': { 'count': 1, },
                'caption': { 'count': 2, },
            },
            'annotations': {
                'labels': {
                    'count': 6,
                    'distribution': {
                        'label_0': [1, 1/6],
                        'label_1': [0, 0.0],
                        'label_2': [3, 3/6],
                        'label_3': [2, 2/6],
                    },
                    'attributes': {
                        'x': {
                            'count': 2, # annotations with no label are skipped
                            'values count': 2,
                            'values present': ['1', '2'],
                            'distribution': {
                                '1': [1, 1/2],
                                '2': [1, 1/2],
                            },
                        },
                        'y': {
                            'count': 2, # annotations with no label are skipped
                            'values count': 1,
                            'values present': ['2'],
                            'distribution': {
                                '2': [2, 2/2],
                            },
                        },
                        # must not include "special" attributes like "occluded"
                    }
                },
                'segments': {
                    'avg. area': (4 * 2 + 9 * 1) / 3,
                    'area distribution': [
                        {'min': 4.0, 'max': 4.5, 'count': 2, 'percent': 2/3},
                        {'min': 4.5, 'max': 5.0, 'count': 0, 'percent': 0.0},
                        {'min': 5.0, 'max': 5.5, 'count': 0, 'percent': 0.0},
                        {'min': 5.5, 'max': 6.0, 'count': 0, 'percent': 0.0},
                        {'min': 6.0, 'max': 6.5, 'count': 0, 'percent': 0.0},
                        {'min': 6.5, 'max': 7.0, 'count': 0, 'percent': 0.0},
                        {'min': 7.0, 'max': 7.5, 'count': 0, 'percent': 0.0},
                        {'min': 7.5, 'max': 8.0, 'count': 0, 'percent': 0.0},
                        {'min': 8.0, 'max': 8.5, 'count': 0, 'percent': 0.0},
                        {'min': 8.5, 'max': 9.0, 'count': 1, 'percent': 1/3},
                    ],
                    'pixel distribution': {
                        'label_0': [0, 0.0],
                        'label_1': [0, 0.0],
                        'label_2': [4, 4/17],
                        'label_3': [13, 13/17],
                    },
                }
            },
        }

        actual = compute_ann_statistics(dataset)

        self.assertEqual(expected, actual)

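A quick cross-check of the expected numbers above (plain Python, not part of the commit): the labeled segments are two 2x2 boxes (area 4 each, labels 2 and 3) and the mask's 9 foreground pixels (label 3), and numpy's default 10-bin histogram over [4, 4, 9] spans 4.0 to 9.0 in steps of 0.5:

import numpy as np

areas = [2 * 2, 2 * 2, 9]          # bbox(label_2), bbox(label_3), mask(label_3)
total = sum(areas)                 # 17 pixels over all labeled segments

assert sum(areas) / len(areas) == (4 * 2 + 9 * 1) / 3   # 'avg. area'
assert 4 / total == 4 / 17         # 'pixel distribution' share of label_2
assert (9 + 4) / total == 13 / 17  # 'pixel distribution' share of label_3

hist, bins = np.histogram(areas)   # 10 equal-width bins by default
assert list(hist) == [2, 0, 0, 0, 0, 0, 0, 0, 0, 1]
assert bins[0] == 4.0 and bins[-1] == 9.0   # bin edges 4.0, 4.5, ..., 9.0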