diff --git a/docs/getting-started/concepts/dataset.md b/docs/getting-started/concepts/dataset.md index 9bdbbfffdf..59f7168905 100644 --- a/docs/getting-started/concepts/dataset.md +++ b/docs/getting-started/concepts/dataset.md @@ -43,4 +43,8 @@ Saved dataset can be later retrieved using `get_saved_dataset` method: ```python dataset = store.get_saved_dataset('my_training_dataset') dataset.to_df() -``` \ No newline at end of file +``` + +--- + +Check out our [tutorial on validating historical features](../../tutorials/validating-historical-features.md) to see how this concept can be applied in a real-world use case. \ No newline at end of file diff --git a/docs/tutorials/tutorials-overview.md b/docs/tutorials/tutorials-overview.md index 86a8c25371..e28e5836f7 100644 --- a/docs/tutorials/tutorials-overview.md +++ b/docs/tutorials/tutorials-overview.md @@ -9,3 +9,5 @@ These Feast tutorials showcase how to use Feast to simplify end to end model tra {% page-ref page="real-time-credit-scoring-on-aws.md" %} {% page-ref page="driver-stats-using-snowflake.md" %} + +{% page-ref page="validating-historical-features.md" %} diff --git a/docs/tutorials/validating-historical-features.md b/docs/tutorials/validating-historical-features.md new file mode 100644 index 0000000000..8dcf82c011 --- /dev/null +++ b/docs/tutorials/validating-historical-features.md @@ -0,0 +1,910 @@ +# Data Quality Monitoring + +## Validating Historical Features with Great Expectations + +In this tutorial, we will use the public dataset of Chicago taxi trips to present data validation capabilities of Feast. The original dataset is stored in BigQuery and consists of raw data for each taxi trip (one row per trip) since 2013. We will generate several training datasets (aka historical features in Feast) for different periods and evaluate expectations made on one dataset against another. 
Our features will represent aggregations of raw data with daily intervals (e.g., trips per day, average fare or speed for a specific day, etc.). We will craft some features using SQL while pulling data from BigQuery (like total trip time or total miles travelled). Another chunk of features will be implemented using Feast's on-demand transformations - features calculated on the fly when requested. + +Our plan: + +0. Prepare environment +1. Pull data from BigQuery (optional) +2. Declare & apply features and feature views in Feast +3. Generate reference dataset +4. Develop & test profiler function +5. Run validation on a different dataset using reference dataset & profiler + + +> The original notebook and datasets for this tutorial can be found on [GitHub](https://github.com/feast-dev/dqm-tutorial). + +### 0. Setup + +Install Feast Python SDK and great expectations: + + +```python +!pip install 'feast[ge]' +``` + + +### 1. Dataset preparation (Optional) + +**You can skip this step if you don't have a GCP account. Please use parquet files that are coming with this tutorial instead** + + +```python +!pip install google-cloud-bigquery
```

 +```python
 +import pyarrow.parquet
 +
 +from google.cloud.bigquery import Client
 +```

 +```python
 +bq_client = Client(project='kf-feast')
 +```
 +
Running some basic aggregations while pulling data from BigQuery. 
Grouping by taxi_id and day: + + +```python +data_query = """SELECT + taxi_id, + TIMESTAMP_TRUNC(trip_start_timestamp, DAY) as day, + SUM(trip_miles) as total_miles_travelled, + SUM(trip_seconds) as total_trip_seconds, + SUM(fare) as total_earned, + COUNT(*) as trip_count +FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` +WHERE + trip_miles > 0 AND trip_seconds > 60 AND + trip_start_timestamp BETWEEN '2019-01-01' and '2020-12-31' AND + trip_total < 1000 +GROUP BY taxi_id, TIMESTAMP_TRUNC(trip_start_timestamp, DAY)""" +``` + + +```python +driver_stats_table = bq_client.query(data_query).to_arrow() + +# Storing resulting dataset into parquet file +pyarrow.parquet.write_table(driver_stats_table, "trips_stats.parquet") +``` + + +```python +def entities_query(year): + return f"""SELECT + distinct taxi_id +FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` +WHERE + trip_miles > 0 AND trip_seconds > 0 AND + trip_start_timestamp BETWEEN '{year}-01-01' and '{year}-12-31' +""" +``` + + +```python +entities_2019_table = bq_client.query(entities_query(2019)).to_arrow() + +# Storing entities (taxi ids) into parquet file +pyarrow.parquet.write_table(entities_2019_table, "entities.parquet") +``` + + +## 2. 
Declaring features + + +```python +import pyarrow.parquet +import pandas as pd + +from feast import Feature, FeatureView, Entity, FeatureStore +from feast.value_type import ValueType +from feast.data_format import ParquetFormat +from feast.on_demand_feature_view import on_demand_feature_view +from feast.infra.offline_stores.file_source import FileSource +from feast.infra.offline_stores.file import SavedDatasetFileStorage + +from google.protobuf.duration_pb2 import Duration +``` + + +```python +batch_source = FileSource( + event_timestamp_column="day", + path="trips_stats.parquet", # using parquet file that we created on previous step + file_format=ParquetFormat() +) +``` + + +```python +taxi_entity = Entity(name='taxi', join_key='taxi_id') +``` + + +```python +trips_stats_fv = FeatureView( + name='trip_stats', + entities=['taxi'], + features=[ + Feature("total_miles_travelled", ValueType.DOUBLE), + Feature("total_trip_seconds", ValueType.DOUBLE), + Feature("total_earned", ValueType.DOUBLE), + Feature("trip_count", ValueType.INT64), + + ], + ttl=Duration(seconds=86400), + batch_source=batch_source, +) +``` + +*Read more about feature views in [Feast docs](https://docs.feast.dev/getting-started/concepts/feature-view)* + + +```python +@on_demand_feature_view( + features=[ + Feature("avg_fare", ValueType.DOUBLE), + Feature("avg_speed", ValueType.DOUBLE), + Feature("avg_trip_seconds", ValueType.DOUBLE), + Feature("earned_per_hour", ValueType.DOUBLE), + ], + inputs={ + "stats": trips_stats_fv + } +) +def on_demand_stats(inp): + out = pd.DataFrame() + out["avg_fare"] = inp["total_earned"] / inp["trip_count"] + out["avg_speed"] = 3600 * inp["total_miles_travelled"] / inp["total_trip_seconds"] + out["avg_trip_seconds"] = inp["total_trip_seconds"] / inp["trip_count"] + out["earned_per_hour"] = 3600 * inp["total_earned"] / inp["total_trip_seconds"] + return out +``` + +*Read more about on demand feature views 
[here](https://docs.feast.dev/reference/alpha-on-demand-feature-view)* + + +```python +store = FeatureStore(".") # using feature_store.yaml that stored in the same directory +``` + + +```python +store.apply([taxi_entity, trips_stats_fv, on_demand_stats]) # writing to the registry +``` + + +## 3. Generating training (reference) dataset + + +```python +taxi_ids = pyarrow.parquet.read_table("entities.parquet").to_pandas() +``` + +Generating range of timestamps with daily frequency: + + +```python +timestamps = pd.DataFrame() +timestamps["event_timestamp"] = pd.date_range("2019-06-01", "2019-07-01", freq='D') +``` + +Cross merge (aka relation multiplication) produces entity dataframe with each taxi_id repeated for each timestamp: + + +```python +entity_df = pd.merge(taxi_ids, timestamps, how='cross') +entity_df +``` + + + + +
+ | taxi_id | +event_timestamp | +
---|---|---|
0 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2019-06-01 | +
1 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2019-06-02 | +
2 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2019-06-03 | +
3 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2019-06-04 | +
4 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2019-06-05 | +
... | +... | +... | +
156979 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2019-06-27 | +
156980 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2019-06-28 | +
156981 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2019-06-29 | +
156982 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2019-06-30 | +
156983 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2019-07-01 | +
156984 rows × 2 columns
++ | total_earned | +avg_trip_seconds | +taxi_id | +total_miles_travelled | +trip_count | +earned_per_hour | +event_timestamp | +total_trip_seconds | +avg_fare | +avg_speed | +
---|---|---|---|---|---|---|---|---|---|---|
0 | +68.25 | +2270.000000 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +24.70 | +2.0 | +54.118943 | +2019-06-01 00:00:00+00:00 | +4540.0 | +34.125000 | +19.585903 | +
1 | +221.00 | +560.500000 | +7a4a6162eaf27805aef407d25d5cb21fe779cd962922cb... | +54.18 | +24.0 | +59.143622 | +2019-06-01 00:00:00+00:00 | +13452.0 | +9.208333 | +14.499554 | +
2 | +160.50 | +1010.769231 | +f4c9d05b215d7cbd08eca76252dae51cdb7aca9651d4ef... | +41.30 | +13.0 | +43.972603 | +2019-06-01 00:00:00+00:00 | +13140.0 | +12.346154 | +11.315068 | +
3 | +183.75 | +697.550000 | +c1f533318f8480a59173a9728ea0248c0d3eb187f4b897... | +37.30 | +20.0 | +47.415956 | +2019-06-01 00:00:00+00:00 | +13951.0 | +9.187500 | +9.625116 | +
4 | +217.75 | +1054.076923 | +455b6b5cae6ca5a17cddd251485f2266d13d6a2c92f07c... | +69.69 | +13.0 | +57.206451 | +2019-06-01 00:00:00+00:00 | +13703.0 | +16.750000 | +18.308692 | +
... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +... | +
156979 | +38.00 | +1980.000000 | +0cccf0ec1f46d1e0beefcfdeaf5188d67e170cdff92618... | +14.90 | +1.0 | +69.090909 | +2019-07-01 00:00:00+00:00 | +1980.0 | +38.000000 | +27.090909 | +
156980 | +135.00 | +551.250000 | +beefd3462e3f5a8e854942a2796876f6db73ebbd25b435... | +28.40 | +16.0 | +55.102041 | +2019-07-01 00:00:00+00:00 | +8820.0 | +8.437500 | +11.591837 | +
156981 | +NaN | +NaN | +9a3c52aa112f46cf0d129fafbd42051b0fb9b0ff8dcb0e... | +NaN | +NaN | +NaN | +2019-07-01 00:00:00+00:00 | +NaN | +NaN | +NaN | +
156982 | +63.00 | +815.000000 | +08308c31cd99f495dea73ca276d19a6258d7b4c9c88e43... | +19.96 | +4.0 | +69.570552 | +2019-07-01 00:00:00+00:00 | +3260.0 | +15.750000 | +22.041718 | +
156983 | +NaN | +NaN | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +NaN | +NaN | +NaN | +2019-07-01 00:00:00+00:00 | +NaN | +NaN | +NaN | +
156984 rows × 10 columns
++ | taxi_id | +event_timestamp | +
---|---|---|
0 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2020-12-01 | +
1 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2020-12-02 | +
2 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2020-12-03 | +
3 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2020-12-04 | +
4 | +91d5288487e87c5917b813ba6f75ab1c3a9749af906a2d... | +2020-12-05 | +
... | +... | +... | +
35443 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2020-12-03 | +
35444 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2020-12-04 | +
35445 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2020-12-05 | +
35446 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2020-12-06 | +
35447 | +7ebf27414a0c7b128e7925e1da56d51a8b81484f7630cf... | +2020-12-07 | +
35448 rows × 2 columns
+