Remove unused apply_row_limits in statement_metrics.py (#9378)
alexandre-normand authored May 19, 2021
1 parent 1b778e7 commit 944fed0
Showing 2 changed files with 1 addition and 183 deletions.
121 changes: 0 additions & 121 deletions datadog_checks_base/datadog_checks/base/utils/db/statement_metrics.py
@@ -131,124 +131,3 @@ def _merge_duplicate_rows(rows, metrics, key):
queries_by_key[query_key] = merged_row

return list(queries_by_key.values())


def apply_row_limits(rows, metric_limits, tiebreaker_metric, tiebreaker_reverse, key):
"""
Given a list of query rows, apply limits ensuring that the top K and bottom K of each metric (columns)
are present. To increase the overlap of rows across metics with the same values (such as 0), the tiebreaker metric
is used as a second sort dimension.
The reason for this custom limit function on metrics is to guarantee that metric `top()` functions show the true
top and true bottom K, even if some limits are applied to drop less interesting queries that fall in the middle.
Longer Explanation of the Algorithm
-----------------------------------
Simply taking the top K and bottom K of all metrics is insufficient. For instance, for K=2 you might have rows
with values:
| query | count | time | errors |
| --------------------|------------|-------------|-------------|
| select * from dogs | 1 (bottom) | 10 (top) | 1 (top) |
| delete from dogs | 2 (bottom) | 8 (top) | 0 (top) |
| commit | 3 | 7 | 0 (bottom) |
| rollback | 4 | 3 | 0 (bottom) |
| select now() | 5 (top) | 2 (bottom) | 0 |
| begin | 6 (top) | 2 (bottom) | 0 |
If you only take the top 2 and bottom 2 values of each column and submit those metrics, then each query is
missing a lot of metrics:
| query | count | time | errors |
| --------------------|------------|-------------|-------------|
| select * from dogs | 1 | 10 | 1 |
| delete from dogs | 2 | 8 | 0 |
| commit | | | 0 |
| rollback | | | 0 |
| select now() | 5 | 2 | |
| begin | 6 | 2 | |
This is fine for showing only one metric, but if the user copies the query tag to find our more information,
that query should have all of the metrics because it is an "interesting" query.
To solve that, you can submit all metrics for all rows with at least on metric submitted, but then the worst-case
for total cardinality is:
(top K + bottom K) * metric count
Note that this only applies to one check run and a completely different set of "tied" metrics can be submitted on
the next check run. Since a large number of rows will have value '0', a tiebreaker is used to bias the selected
rows to rows already picked in the top K / bottom K for the tiebreaker.
| query | count | time | errors |
| --------------------|------------|-------------|-------------|
| select * from dogs | 1 | 10 | 1 |
| delete from dogs | 2 | 8 | 0 |
| commit | | | |
| rollback | | | |
| select now() | 5 | 2 | 0 | <-- biased toward top K count
| begin | 6 | 2 | 0 | <-- biased toward top K count
The queries `commit` and `rollback` were not interesting to keep; they were only selected because they have error
counts 0 (but so do the other queries). So we use the `count` as a tiebreaker to instead choose queries which are
interesting because they have higher execution counts.
- **rows** (_List[dict]_) - rows with columns as metrics
- **metric_limits** (_Dict[str,Tuple[int,int]]_) - dict of the top k and bottom k limits for each metric
ex:
>>> metric_limits = {
>>> 'count': (200, 50),
>>> 'time': (200, 100),
>>> 'lock_time': (50, 50),
>>> ...
>>> 'rows_sent': (100, 0),
>>> }
The first item in each tuple guarantees the top K rows will be chosen for this metric. The second item
guarantees the bottom K rows will also be chosen. Both of these numbers are configurable because you
may want to keep the top 100 slowest queries, but are only interested in the top 10 fastest queries.
That configuration would look like:
>>> metric_limits = {
>>> 'time': (100, 10), # Top 100, bottom 10
>>> ...
>>> }
- **tiebreaker_metric** (_str_) - metric used to resolve ties, intended to increase row overlap in different metrics
- **tiebreaker_reverse** (_bool_) - whether the tiebreaker metric should be in reverse order (descending)
- **key** (_callable_) - function for an ID which uniquely identifies a row
"""
if len(rows) == 0:
return rows

limited = dict()
available_cols = set(rows[0].keys())

for metric, (top_k, bottom_k) in metric_limits.items():
if metric not in available_cols:
continue
# sort_key uses a secondary sort dimension so that if there are a lot of
# the same values (like 0), then there will be more overlap in selected rows
# over time
if tiebreaker_reverse:

def sort_key(row):
return (row[metric], -row[tiebreaker_metric])

else:

def sort_key(row):
return (row[metric], row[tiebreaker_metric])

sorted_rows = sorted(rows, key=sort_key)

top = sorted_rows[len(sorted_rows) - top_k :]
bottom = sorted_rows[:bottom_k]
for row in top:
limited[key(row)] = row
for row in bottom:
limited[key(row)] = row

return list(limited.values())
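
For reference, here is a minimal, hypothetical usage sketch of the removed function; the rows, limits, and key function below are invented for illustration and are not taken from the check code:

rows = [
    {'query_signature': 'q1', 'count': 100, 'time': 900},
    {'query_signature': 'q2', 'count': 2, 'time': 50},
    {'query_signature': 'q3', 'count': 40, 'time': 400},
]

# Keep the top 1 and bottom 1 rows for each metric, breaking ties on 'count' (descending).
limited = apply_row_limits(
    rows,
    metric_limits={'count': (1, 1), 'time': (1, 1)},
    tiebreaker_metric='count',
    tiebreaker_reverse=True,
    key=lambda row: row['query_signature'],
)
# 'q1' is the top 1 and 'q2' the bottom 1 for both metrics, so only those two
# rows survive; 'q3' falls in the middle on every metric and is dropped.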
63 changes: 1 addition & 62 deletions datadog_checks_base/tests/test_db_statements.py
@@ -6,7 +6,7 @@

import pytest

- from datadog_checks.base.utils.db.statement_metrics import StatementMetrics, apply_row_limits
+ from datadog_checks.base.utils.db.statement_metrics import StatementMetrics


def add_to_dict(a, b):
@@ -181,64 +181,3 @@ def key(row):
]

assert expected_merged_metrics == metrics

def test_apply_row_limits(self):
    def assert_any_order(a, b):
        assert sorted(a, key=lambda row: row['_']) == sorted(b, key=lambda row: row['_'])

    rows = [
        {'_': 0, 'count': 2, 'time': 1000},
        {'_': 1, 'count': 20, 'time': 5000},
        {'_': 2, 'count': 20, 'time': 8000},
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},
        {'_': 5, 'count': 60, 'time': 500},
        {'_': 6, 'count': 90, 'time': 5000},
        {'_': 7, 'count': 50, 'time': 5000},
        {'_': 8, 'count': 40, 'time': 100},
        {'_': 9, 'count': 30, 'time': 900},
        {'_': 10, 'count': 80, 'time': 800},
        {'_': 11, 'count': 110, 'time': 7000},
    ]
    assert_any_order(
        [], apply_row_limits(rows, {'count': (0, 0), 'time': (0, 0)}, 'count', True, key=lambda row: row['_'])
    )

    expected = [
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},  # The bottom 1 row for both 'count' and 'time'
        {'_': 2, 'count': 20, 'time': 8000},
    ]
    assert_any_order(
        expected, apply_row_limits(rows, {'count': (1, 1), 'time': (1, 1)}, 'count', True, key=lambda row: row['_'])
    )

    expected = [
        {'_': 5, 'count': 60, 'time': 500},
        {'_': 10, 'count': 80, 'time': 800},
        {'_': 6, 'count': 90, 'time': 5000},
        {'_': 11, 'count': 110, 'time': 7000},
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},
        {'_': 0, 'count': 2, 'time': 1000},
        {'_': 2, 'count': 20, 'time': 8000},
        {'_': 8, 'count': 40, 'time': 100},
    ]
    assert_any_order(
        expected, apply_row_limits(rows, {'count': (5, 2), 'time': (2, 2)}, 'count', True, key=lambda row: row['_'])
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (6, 6), 'time': (0, 0)}, 'time', False, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (0, 0), 'time': (4, 8)}, 'time', False, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (20, 20), 'time': (12, 5)}, 'time', False, key=lambda row: row['_']),
    )
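
A closing note on the tiebreaker behavior these deleted tests exercised: when many rows tie on a metric, the secondary sort dimension decides which tied rows are kept, biasing the selection toward rows that are already notable on the tiebreaker metric. A small sketch with invented data:

tied = [
    {'_': 'a', 'errors': 0, 'count': 90},
    {'_': 'b', 'errors': 0, 'count': 5},
    {'_': 'c', 'errors': 0, 'count': 70},
]
# All rows tie on 'errors', so the bottom-1 pick falls to the tiebreaker: with
# tiebreaker_reverse=True the sort key is (errors, -count), which places the
# highest-count row first and therefore makes it the bottom-1 selection.
picked = apply_row_limits(tied, {'errors': (0, 1)}, 'count', True, key=lambda row: row['_'])
assert [row['_'] for row in picked] == ['a']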
