Remove unused apply_row_limits in statement_metrics.py (#9378)
alexandre-normand authored May 19, 2021
1 parent 1b778e7 commit 944fed0
Showing 2 changed files with 1 addition and 183 deletions.
121 changes: 0 additions & 121 deletions datadog_checks_base/datadog_checks/base/utils/db/statement_metrics.py
@@ -131,124 +131,3 @@ def _merge_duplicate_rows(rows, metrics, key):
queries_by_key[query_key] = merged_row

return list(queries_by_key.values())


def apply_row_limits(rows, metric_limits, tiebreaker_metric, tiebreaker_reverse, key):
"""
Given a list of query rows, apply limits ensuring that the top K and bottom K of each metric (columns)
are present. To increase the overlap of rows across metics with the same values (such as 0), the tiebreaker metric
is used as a second sort dimension.
The reason for this custom limit function on metrics is to guarantee that metric `top()` functions show the true
top and true bottom K, even if some limits are applied to drop less interesting queries that fall in the middle.
Longer Explanation of the Algorithm
-----------------------------------
Simply taking the top K and bottom K of all metrics is insufficient. For instance, for K=2 you might have rows
with values:
| query | count | time | errors |
| --------------------|------------|-------------|-------------|
| select * from dogs | 1 (bottom) | 10 (top) | 1 (top) |
| delete from dogs | 2 (bottom) | 8 (top) | 0 (top) |
| commit | 3 | 7 | 0 (bottom) |
| rollback | 4 | 3 | 0 (bottom) |
| select now() | 5 (top) | 2 (bottom) | 0 |
| begin | 6 (top) | 2 (bottom) | 0 |
If you only take the top 2 and bottom 2 values of each column and submit those metrics, then each query is
missing a lot of metrics:
| query | count | time | errors |
| --------------------|------------|-------------|-------------|
| select * from dogs | 1 | 10 | 1 |
| delete from dogs | 2 | 8 | 0 |
| commit | | | 0 |
| rollback | | | 0 |
| select now() | 5 | 2 | |
| begin | 6 | 2 | |
This is fine for showing only one metric, but if the user copies the query tag to find our more information,
that query should have all of the metrics because it is an "interesting" query.
To solve that, you can submit all metrics for all rows with at least on metric submitted, but then the worst-case
for total cardinality is:
(top K + bottom K) * metric count
Note that this only applies to one check run and a completely different set of "tied" metrics can be submitted on
the next check run. Since a large number of rows will have value '0', a tiebreaker is used to bias the selected
rows to rows already picked in the top K / bottom K for the tiebreaker.
| query | count | time | errors |
| --------------------|------------|-------------|-------------|
| select * from dogs | 1 | 10 | 1 |
| delete from dogs | 2 | 8 | 0 |
| commit | | | |
| rollback | | | |
| select now() | 5 | 2 | 0 | <-- biased toward top K count
| begin | 6 | 2 | 0 | <-- biased toward top K count
The queries `commit` and `rollback` were not interesting to keep; they were only selected because they have error
counts 0 (but so do the other queries). So we use the `count` as a tiebreaker to instead choose queries which are
interesting because they have higher execution counts.
- **rows** (_List[dict]_) - rows with columns as metrics
- **metric_limits** (_Dict[str,Tuple[int,int]]_) - dict of the top k and bottom k limits for each metric
ex:
>>> metric_limits = {
>>> 'count': (200, 50),
>>> 'time': (200, 100),
>>> 'lock_time': (50, 50),
>>> ...
>>> 'rows_sent': (100, 0),
>>> }
The first item in each tuple guarantees the top K rows will be chosen for this metric. The second item
guarantees the bottom K rows will also be chosen. Both of these numbers are configurable because you
may want to keep the top 100 slowest queries, but are only interested in the top 10 fastest queries.
That configuration would look like:
>>> metric_limits = {
>>> 'time': (100, 10), # Top 100, bottom 10
>>> ...
>>> }
- **tiebreaker_metric** (_str_) - metric used to resolve ties, intended to increase row overlap in different metrics
- **tiebreaker_reverse** (_bool_) - whether the tiebreaker metric should be in reverse order (descending)
- **key** (_callable_) - function for an ID which uniquely identifies a row
"""
if len(rows) == 0:
return rows

limited = dict()
available_cols = set(rows[0].keys())

for metric, (top_k, bottom_k) in metric_limits.items():
if metric not in available_cols:
continue
# sort_key uses a secondary sort dimension so that if there are a lot of
# the same values (like 0), then there will be more overlap in selected rows
# over time
if tiebreaker_reverse:

def sort_key(row):
return (row[metric], -row[tiebreaker_metric])

else:

def sort_key(row):
return (row[metric], row[tiebreaker_metric])

sorted_rows = sorted(rows, key=sort_key)

top = sorted_rows[len(sorted_rows) - top_k :]
bottom = sorted_rows[:bottom_k]
for row in top:
limited[key(row)] = row
for row in bottom:
limited[key(row)] = row

return list(limited.values())
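
For reference, here is a minimal, hypothetical usage sketch of the removed function; the rows, limits, and key function below are invented for illustration and are not taken from the check code:

rows = [
    {'query_signature': 'q1', 'count': 100, 'time': 900},
    {'query_signature': 'q2', 'count': 2, 'time': 50},
    {'query_signature': 'q3', 'count': 40, 'time': 400},
]

# Keep the top 1 and bottom 1 rows for each metric, breaking ties on 'count' (descending).
limited = apply_row_limits(
    rows,
    metric_limits={'count': (1, 1), 'time': (1, 1)},
    tiebreaker_metric='count',
    tiebreaker_reverse=True,
    key=lambda row: row['query_signature'],
)
# 'q1' is the top 1 and 'q2' the bottom 1 for both metrics, so only those two
# rows survive; 'q3' falls in the middle on every metric and is dropped.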
63 changes: 1 addition & 62 deletions datadog_checks_base/tests/test_db_statements.py
@@ -6,7 +6,7 @@

import pytest

- from datadog_checks.base.utils.db.statement_metrics import StatementMetrics, apply_row_limits
+ from datadog_checks.base.utils.db.statement_metrics import StatementMetrics


def add_to_dict(a, b):
@@ -181,64 +181,3 @@ def key(row):
]

assert expected_merged_metrics == metrics

def test_apply_row_limits(self):
    def assert_any_order(a, b):
        assert sorted(a, key=lambda row: row['_']) == sorted(b, key=lambda row: row['_'])

    rows = [
        {'_': 0, 'count': 2, 'time': 1000},
        {'_': 1, 'count': 20, 'time': 5000},
        {'_': 2, 'count': 20, 'time': 8000},
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},
        {'_': 5, 'count': 60, 'time': 500},
        {'_': 6, 'count': 90, 'time': 5000},
        {'_': 7, 'count': 50, 'time': 5000},
        {'_': 8, 'count': 40, 'time': 100},
        {'_': 9, 'count': 30, 'time': 900},
        {'_': 10, 'count': 80, 'time': 800},
        {'_': 11, 'count': 110, 'time': 7000},
    ]
    assert_any_order(
        [], apply_row_limits(rows, {'count': (0, 0), 'time': (0, 0)}, 'count', True, key=lambda row: row['_'])
    )

    expected = [
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},  # The bottom 1 row for both 'count' and 'time'
        {'_': 2, 'count': 20, 'time': 8000},
    ]
    assert_any_order(
        expected, apply_row_limits(rows, {'count': (1, 1), 'time': (1, 1)}, 'count', True, key=lambda row: row['_'])
    )

    expected = [
        {'_': 5, 'count': 60, 'time': 500},
        {'_': 10, 'count': 80, 'time': 800},
        {'_': 6, 'count': 90, 'time': 5000},
        {'_': 11, 'count': 110, 'time': 7000},
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},
        {'_': 0, 'count': 2, 'time': 1000},
        {'_': 2, 'count': 20, 'time': 8000},
        {'_': 8, 'count': 40, 'time': 100},
    ]
    assert_any_order(
        expected, apply_row_limits(rows, {'count': (5, 2), 'time': (2, 2)}, 'count', True, key=lambda row: row['_'])
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (6, 6), 'time': (0, 0)}, 'time', False, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (0, 0), 'time': (4, 8)}, 'time', False, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (20, 20), 'time': (12, 5)}, 'time', False, key=lambda row: row['_']),
    )
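
A closing note on the tiebreaker behavior these deleted tests exercised: when many rows tie on a metric, the secondary sort dimension decides which tied rows are kept, biasing the selection toward rows that are already notable on the tiebreaker metric. A small sketch with invented data:

tied = [
    {'_': 'a', 'errors': 0, 'count': 90},
    {'_': 'b', 'errors': 0, 'count': 5},
    {'_': 'c', 'errors': 0, 'count': 70},
]
# All rows tie on 'errors', so the bottom-1 pick falls to the tiebreaker: with
# tiebreaker_reverse=True the sort key is (errors, -count), which places the
# highest-count row first and therefore makes it the bottom-1 selection.
picked = apply_row_limits(tied, {'errors': (0, 1)}, 'count', True, key=lambda row: row['_'])
assert [row['_'] for row in picked] == ['a']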
