Report read_time and write_time as a count of millis (#7323)
This adds monotonic count metrics that report the total time spent reading and writing between check runs, in milliseconds, instead of as a percentage.
albertvaka authored Feb 24, 2021
1 parent e6cbc9b commit 00ba89c
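For context, the psutil counters behind these metrics are cumulative totals, which is what makes a monotonic count the natural metric type: the Agent can derive the per-interval delta itself. A minimal illustrative sketch of those values (not the check's code; it assumes only psutil's documented disk_io_counters interface):

import psutil

# read_time / write_time are cumulative milliseconds spent on I/O per device;
# some platforms omit these fields, as the check's except clause notes.
for device, counters in psutil.disk_io_counters(perdisk=True).items():
    print(device, counters.read_time, counters.write_time)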
Showing 6 changed files with 50 additions and 24 deletions.
12 changes: 6 additions & 6 deletions disk/datadog_checks/disk/disk.py
@@ -314,17 +314,17 @@ def collect_latency_metrics(self):
for disk_name, disk in iteritems(psutil.disk_io_counters(True)):
self.log.debug('IO Counters: %s -> %s', disk_name, disk)
try:
# x100 to have it as a percentage,
# /1000 as psutil returns the value in ms
read_time_pct = disk.read_time * 100 / 1000
write_time_pct = disk.write_time * 100 / 1000
metric_tags = [] if self._custom_tags is None else self._custom_tags[:]
metric_tags.append('device:{}'.format(disk_name))
metric_tags.append('device_name:{}'.format(_base_device_name(disk_name)))
if self.devices_label.get(disk_name):
metric_tags.extend(self.devices_label.get(disk_name))
self.rate(self.METRIC_DISK.format('read_time_pct'), read_time_pct, tags=metric_tags)
self.rate(self.METRIC_DISK.format('write_time_pct'), write_time_pct, tags=metric_tags)
self.monotonic_count(self.METRIC_DISK.format('read_time'), disk.read_time, tags=metric_tags)
self.monotonic_count(self.METRIC_DISK.format('write_time'), disk.write_time, tags=metric_tags)
# FIXME: 8.x, metrics kept for backwards compatibility but are incorrect: the value is not a percentage
# See: https://github.com/DataDog/integrations-core/pull/7323#issuecomment-756427024
self.rate(self.METRIC_DISK.format('read_time_pct'), disk.read_time * 100 / 1000, tags=metric_tags)
self.rate(self.METRIC_DISK.format('write_time_pct'), disk.write_time * 100 / 1000, tags=metric_tags)
except AttributeError as e:
# Some OS don't return read_time/write_time fields
# http://psutil.readthedocs.io/en/latest/#psutil.disk_io_counters
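As a hedged illustration of how a monotonic count behaves between two check runs (the numbers are made up):

# Hypothetical cumulative read_time values seen on two consecutive runs, in ms.
previous, current = 5000, 5300
delta = max(current - previous, 0)  # monotonic counts only report non-negative increases
print(delta)  # 300 ms spent reading between the two runs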
2 changes: 2 additions & 0 deletions disk/metadata.csv
@@ -1,9 +1,11 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
system.disk.free,gauge,,byte,,The amount of disk space that is free.,1,system,disk free
system.disk.in_use,gauge,,fraction,,The amount of disk space in use as a fraction of the total.,-1,system,disk in use
system.disk.read_time,count,,millisecond,,The time in ms spent reading per device,0,system,disk read time
system.disk.read_time_pct,gauge,,percent,,Percent of time spent reading from disk.,0,system,disk read time pct
system.disk.total,gauge,,byte,,The total amount of disk space.,0,system,disk total
system.disk.used,gauge,,byte,,The amount of disk space in use.,-1,system,disk used
system.disk.write_time,count,,millisecond,,The time in ms spent writing per device.,0,system,disk write time
system.disk.write_time_pct,gauge,,percent,,Percent of time spent writing to disk.,0,system,disk write time pct
system.fs.inodes.free,gauge,,inode,,The number of free inodes.,1,system,inodes free
system.fs.inodes.in_use,gauge,,fraction,,The number of inodes in use as a fraction of the total.,-1,system,inodes in use
7 changes: 6 additions & 1 deletion disk/tests/conftest.py
@@ -7,7 +7,7 @@

from datadog_checks.dev.utils import ON_WINDOWS, mock_context_manager

from .metrics import CORE_GAUGES, CORE_RATES, UNIX_GAUGES
from .metrics import CORE_COUNTS, CORE_GAUGES, CORE_RATES, UNIX_GAUGES
from .mocks import (
MockDiskIOMetrics,
MockDiskMetrics,
@@ -67,3 +67,8 @@ def gauge_metrics():
@pytest.fixture(scope='session')
def rate_metrics():
return CORE_RATES


@pytest.fixture(scope='session')
def count_metrics():
return CORE_COUNTS
1 change: 1 addition & 0 deletions disk/tests/metrics.py
@@ -3,6 +3,7 @@
# Licensed under a 3-clause BSD style license (see LICENSE)
CORE_GAUGES = {'system.disk.total': 5, 'system.disk.used': 4, 'system.disk.free': 1, 'system.disk.in_use': 0.80}
CORE_RATES = {'system.disk.write_time_pct': 9.0, 'system.disk.read_time_pct': 5.0}
CORE_COUNTS = {'system.disk.write_time': 90.0, 'system.disk.read_time': 50.0}
UNIX_GAUGES = {
'system.fs.inodes.total': 10,
'system.fs.inodes.used': 1,
4 changes: 2 additions & 2 deletions disk/tests/test_check.py
@@ -6,14 +6,14 @@
from datadog_checks.disk import Disk


def test_check(aggregator, instance_basic_volume, gauge_metrics, rate_metrics):
def test_check(aggregator, instance_basic_volume, gauge_metrics, rate_metrics, count_metrics):
"""
Basic check to see if all metrics are there
"""
c = Disk('disk', {}, [instance_basic_volume])
c.check(instance_basic_volume)

for name in chain(gauge_metrics, rate_metrics):
for name in chain(gauge_metrics, rate_metrics, count_metrics):
aggregator.assert_metric(name)

aggregator.assert_all_metrics_covered()
48 changes: 33 additions & 15 deletions disk/tests/test_unit.py
@@ -45,7 +45,7 @@ def test_bad_config():


@pytest.mark.usefixtures('psutil_mocks')
def test_default(aggregator, gauge_metrics, rate_metrics):
def test_default(aggregator, gauge_metrics, rate_metrics, count_metrics):
"""
Mock psutil and run the check
"""
@@ -65,16 +65,28 @@ def test_default(aggregator, gauge_metrics, rate_metrics):
tags = []

for name, value in iteritems(gauge_metrics):
aggregator.assert_metric(name, value=value, tags=tags)
aggregator.assert_metric(name, value=value, count=1, metric_type=aggregator.GAUGE, tags=tags)

for name, value in iteritems(rate_metrics):
aggregator.assert_metric(
name,
value=value,
count=1,
metric_type=aggregator.RATE,
tags=['device:{}'.format(DEFAULT_DEVICE_NAME), 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME)],
)

aggregator.assert_all_metrics_covered()
for name, value in iteritems(count_metrics):
aggregator.assert_metric(
name,
value=value,
count=1,
metric_type=aggregator.MONOTONIC_COUNT,
tags=['device:{}'.format(DEFAULT_DEVICE_NAME), 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME)],
)

aggregator.assert_all_metrics_covered()
aggregator.reset()


@pytest.mark.usefixtures('psutil_mocks')
@@ -90,7 +102,7 @@ def test_rw(aggregator):


@pytest.mark.usefixtures('psutil_mocks')
def test_use_mount(aggregator, instance_basic_mount, gauge_metrics, rate_metrics):
def test_use_mount(aggregator, instance_basic_mount, gauge_metrics, rate_metrics, count_metrics):
"""
Same as above, using mount to tag
"""
@@ -104,7 +116,7 @@ def test_use_mount(aggregator, instance_basic_mount, gauge_metrics, rate_metrics):
tags=['device:{}'.format(DEFAULT_MOUNT_POINT), 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME)],
)

for name, value in iteritems(rate_metrics):
for name, value in chain(iteritems(rate_metrics), iteritems(count_metrics)):
aggregator.assert_metric(
name,
value=value,
@@ -115,7 +127,7 @@ def test_use_mount(aggregator, instance_basic_mount, gauge_metrics, rate_metrics):


@pytest.mark.usefixtures('psutil_mocks')
def test_device_tagging(aggregator, gauge_metrics, rate_metrics):
def test_device_tagging(aggregator, gauge_metrics, rate_metrics, count_metrics):
instance = {
'use_mount': 'no',
'device_tag_re': {'{}.*'.format(DEFAULT_DEVICE_NAME[:-1]): 'type:dev,tag:two'},
@@ -144,7 +156,7 @@ def test_device_tagging(aggregator, gauge_metrics, rate_metrics):
for name, value in iteritems(gauge_metrics):
aggregator.assert_metric(name, value=value, tags=tags)

for name, value in iteritems(rate_metrics):
for name, value in chain(iteritems(rate_metrics), iteritems(count_metrics)):
aggregator.assert_metric(
name,
value=value,
@@ -173,7 +185,7 @@ def test_get_devices_label():


@pytest.mark.usefixtures('psutil_mocks')
def test_min_disk_size(aggregator, gauge_metrics, rate_metrics):
def test_min_disk_size(aggregator, gauge_metrics, rate_metrics, count_metrics):
instance = {'min_disk_size': 0.001}
c = Disk('disk', {}, [instance])

@@ -189,18 +201,24 @@ def test_min_disk_size(aggregator, gauge_metrics, rate_metrics):
aggregator.assert_metric_has_tag(name, 'device:{}'.format(DEFAULT_DEVICE_NAME))
aggregator.assert_metric_has_tag(name, 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME))

for name in count_metrics:
aggregator.assert_metric_has_tag(name, 'device:{}'.format(DEFAULT_DEVICE_NAME))
aggregator.assert_metric_has_tag(name, 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME))

aggregator.assert_all_metrics_covered()


@pytest.mark.skipif(not Platform.is_linux(), reason='disk labels are only available on Linux')
@pytest.mark.usefixtures('psutil_mocks')
def test_labels_from_blkid_cache_file(aggregator, instance_blkid_cache_file, gauge_metrics, rate_metrics):
def test_labels_from_blkid_cache_file(
aggregator, instance_blkid_cache_file, gauge_metrics, rate_metrics, count_metrics
):
"""
Verify that the disk labels are set when the blkid_cache_file option is set
"""
c = Disk('disk', {}, [instance_blkid_cache_file])
c.check(instance_blkid_cache_file)
for metric in chain(gauge_metrics, rate_metrics):
for metric in chain(gauge_metrics, rate_metrics, count_metrics):
aggregator.assert_metric(
metric, tags=['device:/dev/sda1', 'device_name:sda1', 'label:MYLABEL', 'device_label:MYLABEL']
)
@@ -209,19 +227,19 @@ def test_labels_from_blkid_cache_file(aggregator, instance_blkid_cache_file, gauge_metrics, rate_metrics):
@pytest.mark.skipif(not Platform.is_linux(), reason='disk labels are only available on Linux')
@pytest.mark.usefixtures('psutil_mocks')
def test_blkid_cache_file_contains_no_labels(
aggregator, instance_blkid_cache_file_no_label, gauge_metrics, rate_metrics
aggregator, instance_blkid_cache_file_no_label, gauge_metrics, rate_metrics, count_metrics
):
"""
Verify that the disk labels are ignored if the cache file doesn't contain any
"""
c = Disk('disk', {}, [instance_blkid_cache_file_no_label])
c.check(instance_blkid_cache_file_no_label)
for metric in chain(gauge_metrics, rate_metrics):
for metric in chain(gauge_metrics, rate_metrics, count_metrics):
aggregator.assert_metric(metric, tags=['device:/dev/sda1', 'device_name:sda1'])


@pytest.mark.usefixtures('psutil_mocks')
def test_timeout_config(aggregator, gauge_metrics, rate_metrics):
def test_timeout_config(aggregator):
"""Test timeout configuration value is used on every timeout on the check."""

# Arbitrary value
@@ -242,7 +260,7 @@ def no_timeout(fun):


@pytest.mark.usefixtures('psutil_mocks')
def test_timeout_warning(aggregator, gauge_metrics, rate_metrics):
def test_timeout_warning(aggregator, gauge_metrics, rate_metrics, count_metrics):
"""Test a warning is raised when there is a Timeout exception."""

# Raise exception for "/faulty" mountpoint
@@ -271,7 +289,7 @@ def f(mountpoint):
for name in gauge_metrics:
aggregator.assert_metric(name, count=0)

for name in rate_metrics:
for name in chain(rate_metrics, count_metrics):
aggregator.assert_metric_has_tag(name, 'device:{}'.format(DEFAULT_DEVICE_NAME))
aggregator.assert_metric_has_tag(name, 'device_name:{}'.format(DEFAULT_DEVICE_BASE_NAME))

