Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dbt metrics submodule #28

Merged
merged 6 commits into from
May 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@
[submodule "dbt-date"]
path = dbt-date
url = https://github.com/calogica/dbt-date.git
[submodule "dbt_metrics"]
path = dbt_metrics
url = https://github.com/dbt-labs/dbt_metrics.git
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ dbt-trino-tests:
./docker/init_trino.bash
./docker/run_dbt_utils_integration_tests.bash
./docker/run_dbt_date_integration_tests.bash
./docker/run_dbt_metrics_integration_tests.bash
./docker/remove_trino.bash

dbt-starburst-tests:
Expand All @@ -12,4 +13,5 @@ dbt-starburst-tests:
./docker/init_starburst.bash
./docker/run_dbt_utils_integration_tests.bash
./docker/run_dbt_date_integration_tests.bash
./docker/run_dbt_metrics_integration_tests.bash
./docker/remove_starburst.bash
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ To make use of these trino adaptations in your dbt project, you must do two thin
search_order: ['trino_utils', 'dbt_utils']
- macro_namespace: dbt_date
search_order: ['trino_utils', 'dbt_date']
- macro_namespace: metrics
search_order: ['trino_utils', 'metrics']
```
Check [dbt Hub](https://hub.getdbt.com) for the latest installation
instructions, or [read the docs](https://docs.getdbt.com/docs/package-management)
Expand Down
1 change: 1 addition & 0 deletions dbt_metrics
Submodule dbt_metrics added at 5897ce
2 changes: 1 addition & 1 deletion docker-compose-starburst.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ services:
trino:
ports:
- "8080:8080"
image: "starburstdata/starburst-enterprise:397-e"
image: "starburstdata/starburst-enterprise:407-e.6"
volumes:
- ./docker/starburst/etc:/etc/starburst
- ./docker/starburst/catalog:/etc/starburst/catalog
Expand Down
2 changes: 1 addition & 1 deletion docker-compose-trino.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ services:
trino:
ports:
- "8080:8080"
image: "trinodb/trino:397"
image: "trinodb/trino:417"
volumes:
- ./docker/trino/etc:/usr/lib/trino/etc:ro
- ./docker/trino/catalog:/etc/trino/catalog
Expand Down
7 changes: 0 additions & 7 deletions docker/Dockerfile.util

This file was deleted.

3 changes: 1 addition & 2 deletions docker/init_starburst.bash
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,5 @@ cd ..
set -exo pipefail

docker-compose -f docker-compose-starburst.yml build
docker-compose -f docker/util.yml build
docker-compose -f docker-compose-starburst.yml up -d
docker-compose -f docker/util.yml run --rm util wait_for_up trino 8080 10
timeout 5m bash -c -- 'while ! docker-compose -f docker-compose-starburst.yml logs trino 2>&1 | tail -n 1 | grep "SERVER STARTED"; do sleep 2; done'
3 changes: 1 addition & 2 deletions docker/init_trino.bash
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,5 @@ cd ..
set -exo pipefail

docker-compose -f docker-compose-trino.yml build
docker-compose -f docker/util.yml build
docker-compose -f docker-compose-trino.yml up -d
docker-compose -f docker/util.yml run --rm util wait_for_up trino 8080
timeout 5m bash -c -- 'while ! docker-compose -f docker-compose-trino.yml logs trino 2>&1 | tail -n 1 | grep "SERVER STARTED"; do sleep 2; done'
15 changes: 15 additions & 0 deletions docker/run_dbt_metrics_integration_tests.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash

# Runs the dbt_metrics integration test project in a container started from
# the `dbt-trino-utils` image on the `dbt-net` docker network. The container
# command installs dependencies, then seeds, runs and tests the project.

# move to wherever we are so docker things work; stop if that fails, because
# the bind mount below is resolved relative to $PWD
cd "$(dirname "${BASH_SOURCE[0]}")" || exit 1

set -exo pipefail
# "$PWD/..." is quoted so a checkout path containing spaces or glob characters
# is passed to `-v` as a single argument.
docker run \
    --network="dbt-net" \
    -v "$PWD/dbt:/root/.dbt" \
    dbt-trino-utils \
    "cd /opt/dbt_trino_utils/integration_tests/dbt_metrics \
    && dbt deps \
    && dbt seed \
    && dbt run \
    && dbt test"
12 changes: 0 additions & 12 deletions docker/util.yml

This file was deleted.

18 changes: 0 additions & 18 deletions docker/wait_for_up.bash

This file was deleted.

81 changes: 81 additions & 0 deletions integration_tests/dbt_metrics/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Project settings for the trino_utils dbt_metrics integration tests.
name: "trino_utils_dbt_metrics_integration_tests"
version: "1.0.0"
config-version: 2

profile: "integration_tests"

model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

target-path: "target"
clean-targets:
  - "target"
  - "dbt_packages"
  - "logs"

# Macros in the `metrics` namespace are looked up in this project first,
# then in trino_utils, and finally in the metrics package itself.
dispatch:
  - macro_namespace: metrics
    search_order: ['trino_utils_dbt_metrics_integration_tests', 'trino_utils', 'metrics']

models:

trino_utils_dbt_metrics_integration_tests:
metric_testing_models:
+materialized: table

dbt_metrics_integration_tests:

# Overridden by trino__custom_calendar
custom_calendar:
+enabled: false

metric_testing_models:
+materialized: table

# no median function in Trino

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we could use another function to calculate the median. WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use the approx_percentile function. However, to calculate the median correctly for both odd and even numbers of values in the column, we need a workaround, because approx_percentile(col_name, 0.50) alone does not give the right result for an even number of values (workaround source):

Create a sample table with an even number of rows:

CREATE TABLE memory.default.test_table AS (
select 1 AS col_name
union all
select 2 AS col_name
);

Calculate median:

SELECT
IF (
    COUNT(*) % 2 = 0,
    (approx_percentile(col_name, 0.499999) + approx_percentile(col_name, 0.50))/2.0,
    approx_percentile(col_name, 0.50)
) AS median_value
FROM memory.default.test_table;

So, it takes a bit of code to calculate the median in Trino. The best solution would certainly be to implement MEDIAN in Trino itself. There is an issue for that, but it is very stale and has gained no attention: trinodb/trino#6309
WDYT?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just create a follow-up issue and include this code.

Copy link

@SeungHuLee SeungHuLee Jun 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there anything wrong with getting the median via value_at_quantile(tdigest_agg(col_name), 0.5)?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@SeungHuLee your solution has the same downside as using approx_percentile alone: it doesn't work when the number of rows is even and the two middle values are different. In that case, the function should return the arithmetic mean of the two middle values, but your solution returns the larger one.

So, running this query on the table I defined in my previous comment:

SELECT
value_at_quantile(tdigest_agg(col_name), 0.5) as median_value
FROM memory.default.test_table;

returns 2.0, but the correct result is 1.5.

That's why the workaround mentioned above is needed.

Copy link

@SeungHuLee SeungHuLee Jun 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@damian3031 Thanks for the detailed answer! I'm not really a statistics expert, so I was just using the tdigest methods according to the following:
trinodb/trino#5158
trinodb/trino#4975
https://arxiv.org/pdf/1902.04023.pdf
By the way, when I use the workaround suggested above, the median result changes slightly every time I execute the same query.
Is there a more stable or accurate solution?

base_median_metric:
+enabled: false
base_median_metric_no_time_grain:
+enabled: false

# no 'is true' predicate in trino
base_count_distinct_metric:
+enabled: false
derived_metric:
+enabled: false

# Overridden by trino__develop_metric
develop_metric:
+enabled: false
# Overridden by trino__simple_develop_metric
simple_develop_metric:
+enabled: false

# TODO: Fix and enable
hovaesco marked this conversation as resolved.
Show resolved Hide resolved
base_count_metric__secondary_calculations:
+enabled: false
base_sum_metric__prior:
+enabled: false
multiple_metrics__period_over_period:
+enabled: false
multiple_metrics__period_to_date:
+enabled: false
multiple_metrics__rolling:
+enabled: false
# issue with base_sum_metric.yml
# with config: restrict_no_time_grain
base_sum_metric:
+enabled: false
ratio_metric:
+enabled: false

materialized_models:
+materialized: table

# Project variables: the calendar model name and the list of calendar
# dimensions made available to the metrics package.
vars:
  dbt_metrics_calendar_model: trino__custom_calendar
  custom_calendar_dimension_list: ["is_weekend"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{# Every column of trino__fact_orders_source plus a derived discount_total. #}
select
    *,
    round(order_total - (order_total / 2)) as discount_total
from {{ ref('trino__fact_orders_source') }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: 2

# Average of `discount_total` on trino__fact_orders, sliceable by
# had_discount and order_country.
metrics:
  - name: base_average_metric
    model: ref('trino__fact_orders')
    label: Total Discount ($)
    timestamp: order_date
    # NOTE(review): `test` is not a calendar period like the others;
    # presumably a custom grain used by the test suite — confirm.
    time_grains: [day, week, month, test]
    calculation_method: average
    expression: discount_total
    dimensions:
      - had_discount
      - order_country
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
version: 2

# Distinct count of `customer_id` on trino__fact_orders over a 14-month
# window, restricted by the two filters below.
metrics:
  - name: base_count_distinct_metric
    model: ref('trino__fact_orders')
    label: Count Distinct
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: count_distinct
    expression: customer_id
    dimensions:
      - had_discount
      - order_country
    window:
      count: 14
      period: month
    filters:
      # NOTE(review): this filter renders as `had_discount is true`; verify
      # that the target engine accepts the `is true` predicate.
      - field: had_discount
        operator: 'is'
        value: 'true'
      # The inner single quotes are part of the value, so it is emitted as a
      # quoted string literal.
      - field: order_country
        operator: '='
        value: "'CA'"
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: 2

# Count of `order_total` on trino__fact_orders, sliceable by had_discount
# and order_country.
metrics:
  - name: base_count_metric
    model: ref('trino__fact_orders')
    label: Total Discount ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: count
    expression: order_total
    dimensions:
      - had_discount
      - order_country
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
version: 2

# Median of `discount_total` on trino__fact_orders, sliceable by
# had_discount and order_country.
metrics:
  - name: base_median_metric
    model: ref('trino__fact_orders')
    label: Total Discount ($)
    timestamp: order_date
    time_grains: [day, week, month, all_time]
    calculation_method: median
    expression: discount_total
    dimensions:
      - had_discount
      - order_country
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
version: 2

# Sum-of-`order_total` metrics. They differ in source model, window,
# timestamp and config; all are sliceable by had_discount and order_country.
metrics:
  # Only metric in this file that sets restrict_no_time_grain.
  - name: base_sum_metric
    model: ref('trino__fact_orders')
    label: Order Total ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: sum
    expression: order_total
    dimensions:
      - had_discount
      - order_country
    config:
      restrict_no_time_grain: true

  # Same definition, reading from fact_orders_duplicate.
  - name: base_sum_metric_duplicate
    model: ref('fact_orders_duplicate')
    label: Order Total ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: sum
    expression: order_total
    dimensions:
      - had_discount
      - order_country

  # NOTE(review): the name says "14_day" but the window period is `month`;
  # confirm which one is intended.
  - name: base_sum_metric__14_day_window
    model: ref('trino__fact_orders')
    label: Order Total ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: sum
    expression: order_total
    window:
      count: 14
      period: month
    dimensions:
      - had_discount
      - order_country

  # Reads from fact_orders rather than trino__fact_orders.
  - name: base_test_metric
    model: ref('fact_orders')
    label: Order Total ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: sum
    expression: order_total
    dimensions:
      - had_discount
      - order_country

  # Defined without `timestamp` and `time_grains`.
  - name: base_sum_metric__no_timestamp
    model: ref('fact_orders')
    label: Order Total ($)
    calculation_method: sum
    expression: order_total
    dimensions:
      - had_discount
      - order_country
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
version: 2

# Sums a CASE expression that yields 1 when had_discount is true and 0
# otherwise, sliceable by order_country.
metrics:
  - name: case_when_metric
    model: ref('trino__fact_orders')
    label: Order Total ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: sum
    expression: case when had_discount = true then 1 else 0 end
    dimensions:
      - order_country
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{# Defines three metrics inline as YAML and selects `derived_metric` at month
   grain through metrics.develop. The YAML sits inside a raw block so the
   nested metric() expressions are passed along as text instead of being
   rendered while the variable is set. #}
{% set my_metric_yml -%}
{% raw %}

metrics:
  - name: develop_metric
    model: ref('trino__fact_orders')
    label: Total Discount ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: average
    expression: discount_total
    dimensions:
      - had_discount
      - order_country

  - name: derived_metric
    label: Total Discount ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: derived
    expression: "{{ metric('develop_metric') }} - 1 "
    dimensions:
      - had_discount
      - order_country

  - name: some_other_metric_not_using
    label: Total Discount ($)
    timestamp: order_date
    time_grains: [day, week, month]
    calculation_method: derived
    expression: "{{ metric('derived_metric') }} - 1 "
    dimensions:
      - had_discount
      - order_country

{% endraw %}
{%- endset %}

select *
from {{ metrics.develop(
        develop_yml=my_metric_yml,
        metric_list=['derived_metric'],
        grain='month'
        )
    }}
Loading