diff --git a/.github/workflows/ghcr_push.yml b/.github/workflows/ghcr_push.yml
index 025d5717c..d5b83e6c0 100644
--- a/.github/workflows/ghcr_push.yml
+++ b/.github/workflows/ghcr_push.yml
@@ -47,6 +47,8 @@ jobs:
             path: case_studies/ml_training
           - name: eda
             path: case_studies/feature_engineering
+          - name: feast_integration
+            path: case_studies/feature_engineering
     steps:
       - uses: actions/checkout@v2
         with:
@@ -64,4 +66,4 @@ jobs:
           registry: ghcr.io
           build_extra_args: "--compress=true --build-arg=tag=ghcr.io/${{ github.repository_owner }}/flytecookbook:${{ matrix.directory.name }}-${{ github.sha }}"
           context: ./cookbook/${{ matrix.directory.path }}
-          dockerfile: ${{ matrix.directory.name }}/Dockerfile
\ No newline at end of file
+          dockerfile: ${{ matrix.directory.name }}/Dockerfile
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/Dockerfile b/cookbook/case_studies/feature_engineering/feast_integration/Dockerfile
new file mode 100644
index 000000000..929b7f632
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/Dockerfile
@@ -0,0 +1,37 @@
+FROM python:3.8-buster
+
+WORKDIR /root
+ENV VENV /opt/venv
+ENV LANG C.UTF-8
+ENV LC_ALL C.UTF-8
+ENV PYTHONPATH "$PYTHONPATH:/root/feast_integration"
+
+# Install the AWS CLI separately to prevent issues with boto being overwritten
+RUN pip3 install awscli
+
+# Virtual environment
+RUN python3 -m venv ${VENV}
+RUN ${VENV}/bin/pip install wheel
+ENV PATH="${VENV}/bin:$PATH"
+
+# Install Python dependencies
+COPY feast_integration/requirements.txt /root/.
+RUN ${VENV}/bin/pip install -r /root/requirements.txt
+
+COPY feast_integration/sandbox.config /root/
+COPY in_container.mk /root/Makefile
+
+# Copy the actual code
+COPY feast_integration/ /root/feast_integration/
+
+# This tag is supplied by the build script and will be used to determine the version
+# when registering tasks, workflows, and launch plans
+ARG tag
+ENV FLYTE_INTERNAL_IMAGE $tag
+
+# Copy over the helper script that the SDK relies on
+RUN cp ${VENV}/bin/flytekit_venv /usr/local/bin/
+RUN chmod a+x /usr/local/bin/flytekit_venv
+
+# Enable the virtualenv for this image. Note this relies on the VENV variable we've set in this image.
+ENTRYPOINT ["/usr/local/bin/flytekit_venv"]
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/Makefile b/cookbook/case_studies/feature_engineering/feast_integration/Makefile
new file mode 100644
index 000000000..1e9c73693
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/Makefile
@@ -0,0 +1,3 @@
+PREFIX=feast
+include ../../../common/common.mk
+include ../../../common/leaf.mk
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/README.rst b/cookbook/case_studies/feature_engineering/feast_integration/README.rst
new file mode 100644
index 000000000..a65a09960
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/README.rst
@@ -0,0 +1,66 @@
+Feast Integration
+-----------------
+
+**Feature Engineering** has lately become one of the most prominent topics in Machine Learning.
+It is the process of transforming raw data into features that better represent the underlying problem to predictive models, resulting in improved model accuracy on unseen data.
+
+`Feast`_ is an operational data system for managing and serving machine learning features to models in production.
+
+.. _Feast: https://feast.dev/
+
+Flyte provides a way to train models and perform feature engineering as a single pipeline.
+However, it offers no way to serve these features once the model matures and is ready to be deployed in production.
+
+Flyte adds the capability of engineering the features, while Feast provides the feature registry and the online feature-serving system. Notably, this combination lets you develop features incrementally and turn on the sync to the online store only when you are confident about the features.
+
+In this tutorial, we'll walk through how Feast can be used to store and retrieve features to train and test a model built with a Flyte pipeline.
+
+Dataset
+=======
+We'll use the horse colic dataset to determine whether a horse's lesion is surgical. This is a modified version of the original dataset.
+
+The dataset has the following columns:
+
+.. list-table:: Horse Colic Features
+   :widths: 25 25 25 25 25
+
+   * - surgery
+     - Age
+     - Hospital Number
+     - rectal temperature
+     - pulse
+   * - respiratory rate
+     - temperature of extremities
+     - peripheral pulse
+     - mucous membranes
+     - capillary refill time
+   * - pain
+     - peristalsis
+     - abdominal distension
+     - nasogastric tube
+     - nasogastric reflux
+   * - nasogastric reflux PH
+     - rectal examination
+     - abdomen
+     - packed cell volume
+     - total protein
+   * - abdominocentesis appearance
+     - abdomcentesis total protein
+     - outcome
+     - surgical lesion
+     - timestamp
+
+The horse colic dataset ships as a compressed zip file containing a SQLite database. For this example we just wanted a dataset that was available online, but the pipeline could easily be plugged into another dataset or data management system such as Snowflake, Athena, Hive, BigQuery, or Spark, all of which are supported by Flyte.
+
+Takeaways
+=========
+The example we're demonstrating is a simple feature engineering job that you can seamlessly construct with Flyte. Here are the key points:
+
+#. Source data is from SQL-like data sources
+#. Predefined feature transforms
+#. Ability to create a low-code platform
+#. Feast integration
+#. Serving features in production using Feast
+#. TaskTemplate within an imperative workflow
+
+.. tip::
+
+   If you're a data scientist, you needn't worry about the infrastructure overhead. Flyte provides an easy-to-use interface that looks just like a typical library.
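+
+To give a sense of the end state: once features have been materialized to the online store, fetching them at inference time is a single call. Below is a minimal sketch, assuming the ``FeatureStore`` wrapper defined later in this example; the feature view name and entity value are the ones used throughout the tutorial.
+
+.. code-block:: python
+
+   online_features = feature_store.get_online_features(
+       features=["horse_colic_stats:rectal temperature", "horse_colic_stats:total protein"],
+       entity_rows=[{"Hospital Number": "530101"}],
+   )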
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/__init__.py b/cookbook/case_studies/feature_engineering/feast_integration/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/custom_provider/provider.py b/cookbook/case_studies/feature_engineering/feast_integration/custom_provider/provider.py
new file mode 100644
index 000000000..b124edab4
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/custom_provider/provider.py
@@ -0,0 +1,86 @@
+from datetime import datetime
+from typing import Callable, List, Union
+
+import pandas
+from feast.feature_view import FeatureView
+from feast.infra.local import LocalProvider
+from feast.infra.offline_stores.file_source import FileSource
+from feast.infra.offline_stores.offline_store import RetrievalJob
+from feast.registry import Registry
+from feast.repo_config import RepoConfig
+from flytekit.core.context_manager import FlyteContext
+from tqdm import tqdm
+
+
+class FlyteCustomProvider(LocalProvider):
+    def __init__(self, config: RepoConfig, repo_path):
+        # Feast passes the repo path when instantiating a provider; the local provider doesn't need it
+        super().__init__(config)
+
+    def materialize_single_feature_view(
+        self,
+        config: RepoConfig,
+        feature_view: FeatureView,
+        start_date: datetime,
+        end_date: datetime,
+        registry: Registry,
+        project: str,
+        tqdm_builder: Callable[[int], tqdm],
+    ) -> None:
+        """
+        Loads the latest feature values for a specific feature view from the offline store into the online store.
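+
+        Before delegating to ``LocalProvider``, we re-point the feature view's batch source at a local
+        copy of its backing parquet file, since the file typically lives in remote (e.g. S3) storage
+        that the local provider cannot read directly.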
+ """ + self._localize_feature_view(feature_view) + + super().materialize_single_feature_view( + config, feature_view, start_date, end_date, registry, project, tqdm_builder + ) + + def get_historical_features( + self, + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pandas.DataFrame, str], + registry: Registry, + project: str, + full_feature_names: bool, + ) -> RetrievalJob: + """ + Returns a training dataframe from the offline store + """ + # We substitute the remote s3 file with a reference to a local file in each feature view being requested + for fv in feature_views: + self._localize_feature_view(fv) + + return super().get_historical_features( + config, + feature_views, + feature_refs, + entity_df, + registry, + project, + full_feature_names, + ) + + def _localize_feature_view(self, feature_view: FeatureView): + """ + This function ensures that the `FeatureView` object points to files in the local disk + """ + if not isinstance(feature_view.batch_source, FileSource): + return + # Copy parquet file to a local file + file_source: FileSource = feature_view.batch_source + random_local_path = FlyteContext.current_context().file_access.get_random_local_path(file_source.path) + FlyteContext.current_context().file_access.get_data( + file_source.path, + random_local_path, + is_multipart=True, + ) + feature_view.batch_source=FileSource( + path=random_local_path, + event_timestamp_column=file_source.event_timestamp_column, + ) diff --git a/cookbook/case_studies/feature_engineering/feast_integration/feast_dataobjects.py b/cookbook/case_studies/feature_engineering/feast_integration/feast_dataobjects.py new file mode 100644 index 000000000..891c95aac --- /dev/null +++ b/cookbook/case_studies/feature_engineering/feast_integration/feast_dataobjects.py @@ -0,0 +1,107 @@ +from flytekit.configuration import aws +from datetime import datetime +import pandas as pd +import os +from typing import Type +from dataclasses import dataclass +from dataclasses_json import dataclass_json +from feast import repo_config +from feast.feature_store import FeatureStore +from feast.repo_config import RepoConfig +from flytekit import FlyteContext +from flytekit.core.type_engine import TypeEngine, TypeTransformer +from flytekit.models.literals import Literal, Scalar +from flytekit.models.types import LiteralType, SimpleType +from feast.infra.offline_stores.file import FileOfflineStoreConfig +from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig +from feast import FeatureStore as FeastFeatureStore +from google.protobuf.struct_pb2 import Struct +from google.protobuf.json_format import MessageToDict +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from feast.entity import Entity +from feast.feature_view import FeatureView +from feast.feature_service import FeatureService + + +@dataclass_json +@dataclass +class FeatureStoreConfig: + registry_path: str + project: str + s3_bucket: str + online_store_path: str = 'online.db' + + +@dataclass_json +@dataclass +class FeatureStore: + config: FeatureStoreConfig + + def _build_feast_feature_store(self): + os.environ["FEAST_S3_ENDPOINT_URL"] = aws.S3_ENDPOINT.get() + os.environ["AWS_ACCESS_KEY_ID"] = aws.S3_ACCESS_KEY_ID.get() + os.environ["AWS_SECRET_ACCESS_KEY"] = aws.S3_SECRET_ACCESS_KEY.get() + + config = RepoConfig( + registry=f"s3://{self.config.s3_bucket}/{self.config.registry_path}", + project=self.config.project, + # Notice the use of a custom provider. 
+ provider="custom_provider.provider.FlyteCustomProvider", + offline_store=FileOfflineStoreConfig(), + online_store=SqliteOnlineStoreConfig(path=self.config.online_store_path), + ) + return FeastFeatureStore(config=config) + + def apply( + self, + objects: Union[ + Entity, + FeatureView, + FeatureService, + List[Union[FeatureView, Entity, FeatureService]], + ], + ) -> None: + fs = self._build_feast_feature_store() + fs.apply(objects) + + # Applying also initializes the sqlite tables in the online store + FlyteContext.current_context().file_access.upload(self.config.online_store_path, f"s3://{self.config.s3_bucket}/{self.config.online_store_path}") + + def get_historical_features( + self, + entity_df: Union[pd.DataFrame, str], + features: Optional[Union[List[str], FeatureService]] = None, + ) -> pd.DataFrame: + fs = self._build_feast_feature_store() + retrieval_job = fs.get_historical_features( + entity_df=entity_df, + features=features, + ) + return retrieval_job.to_df() + + def materialize( + self, + start_date: datetime, + end_date: datetime, + feature_views: Optional[List[str]] = None, + ) -> None: + FlyteContext.current_context().file_access.download(f"s3://{self.config.s3_bucket}/{self.config.online_store_path}", self.config.online_store_path) + fs = self._build_feast_feature_store() + fs.materialize( + start_date=start_date, + end_date=end_date, + ) + FlyteContext.current_context().file_access.upload(self.config.online_store_path, f"s3://{self.config.s3_bucket}/{self.config.online_store_path}") + + def get_online_features( + self, + features: Union[List[str], FeatureService], + entity_rows: List[Dict[str, Any]], + feature_refs: Optional[List[str]] = None, + full_feature_names: bool = False, + ) -> Dict[str, Any]: + FlyteContext.current_context().file_access.download(f"s3://{self.config.s3_bucket}/{self.config.online_store_path}", self.config.online_store_path) + fs = self._build_feast_feature_store() + + online_response = fs.get_online_features(features, entity_rows, feature_refs, full_feature_names) + return online_response.to_dict() diff --git a/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py b/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py new file mode 100644 index 000000000..6bdb08213 --- /dev/null +++ b/cookbook/case_studies/feature_engineering/feast_integration/feast_workflow.py @@ -0,0 +1,252 @@ +import os +from datetime import datetime, timedelta + +from flytekit.core.context_manager import FlyteContext + +import random +import joblib +import logging +import typing +import pandas as pd +from feast import ( + Entity, + Feature, + FeatureStore, + FeatureView, + FileSource, + RepoConfig, + ValueType, + online_response, + registry, +) +from flytekit.core.node_creation import create_node +from feast.infra.offline_stores.file import FileOfflineStoreConfig +from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig +from flytekit import reference_task, task, workflow, Workflow +from flytekit.extras.sqlite3.task import SQLite3Config, SQLite3Task +from flytekit.types.file import JoblibSerializedFile +from flytekit.types.file.file import FlyteFile +from flytekit.types.schema import FlyteSchema +from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import GaussianNB +from flytekit.configuration import aws +from feature_eng_tasks import mean_median_imputer, univariate_selection +from feast_dataobjects import FeatureStore, FeatureStoreConfig + + +logger = logging.getLogger(__file__) +# TODO: 
+FEAST_FEATURES = [
+    "horse_colic_stats:rectal temperature",
+    "horse_colic_stats:total protein",
+    "horse_colic_stats:peripheral pulse",
+    "horse_colic_stats:surgical lesion",
+    "horse_colic_stats:abdominal distension",
+    "horse_colic_stats:nasogastric tube",
+    "horse_colic_stats:outcome",
+    "horse_colic_stats:packed cell volume",
+    "horse_colic_stats:nasogastric reflux PH",
+]
+DATABASE_URI = "https://cdn.discordapp.com/attachments/545481172399030272/861575373783040030/horse_colic.db.zip"
+DATA_CLASS = "surgical lesion"
+
+
+sql_task = SQLite3Task(
+    name="sqlite3.horse_colic",
+    query_template="select * from data",
+    output_schema_type=FlyteSchema,
+    task_config=SQLite3Config(
+        uri=DATABASE_URI,
+        compressed=True,
+    ),
+)
+
+
+@task
+def store_offline(feature_store: FeatureStore, dataframe: FlyteSchema):
+    horse_colic_entity = Entity(name="Hospital Number", value_type=ValueType.STRING)
+
+    horse_colic_feature_view = FeatureView(
+        name="horse_colic_stats",
+        entities=["Hospital Number"],
+        features=[
+            Feature(name="rectal temperature", dtype=ValueType.FLOAT),
+            Feature(name="total protein", dtype=ValueType.FLOAT),
+            Feature(name="peripheral pulse", dtype=ValueType.FLOAT),
+            Feature(name="surgical lesion", dtype=ValueType.STRING),
+            Feature(name="abdominal distension", dtype=ValueType.FLOAT),
+            Feature(name="nasogastric tube", dtype=ValueType.STRING),
+            Feature(name="outcome", dtype=ValueType.STRING),
+            Feature(name="packed cell volume", dtype=ValueType.FLOAT),
+            Feature(name="nasogastric reflux PH", dtype=ValueType.FLOAT),
+        ],
+        batch_source=FileSource(
+            path=str(dataframe.remote_path),
+            event_timestamp_column="timestamp",
+        ),
+        ttl=timedelta(days=1),
+    )
+
+    # Ingest the entity and the feature view into Feast
+    feature_store.apply([horse_colic_entity, horse_colic_feature_view])
+
+
+@task
+def load_historical_features(feature_store: FeatureStore) -> FlyteSchema:
+    entity_df = pd.DataFrame.from_dict(
+        {
+            "Hospital Number": [
+                "530101",
+                "5290409",
+                "5291329",
+                "530051",
+                "529518",
+                "530101",
+                "529340",
+                "5290409",
+                "530034",
+            ],
+            "event_timestamp": [
+                datetime(2021, 6, 25, 16, 36, 27),
+                datetime(2021, 6, 25, 16, 36, 27),
+                datetime(2021, 6, 25, 16, 36, 27),
+                datetime(2021, 6, 25, 16, 36, 27),
+                datetime(2021, 6, 25, 16, 36, 27),
+                datetime(2021, 7, 5, 11, 36, 1),
+                datetime(2021, 6, 25, 16, 36, 27),
+                datetime(2021, 7, 5, 11, 50, 40),
+                datetime(2021, 6, 25, 16, 36, 27),
+            ],
+        }
+    )
+
+    return feature_store.get_historical_features(
+        entity_df=entity_df,
+        features=FEAST_FEATURES,
+    )
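+
+
+# %%
+# ``get_historical_features`` performs a point-in-time join: for each ("Hospital Number",
+# "event_timestamp") pair in ``entity_df``, Feast returns the latest feature values recorded at or
+# before that timestamp, subject to the feature view's one-day TTL.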
+
+
+# %%
+# Next, we train a Naive Bayes model using the data that has been fetched from the feature store.
+@task
+def train_model(dataset: pd.DataFrame, data_class: str) -> JoblibSerializedFile:
+    X_train, _, y_train, _ = train_test_split(
+        dataset[dataset.columns[~dataset.columns.isin([data_class])]],
+        dataset[data_class],
+        test_size=0.33,
+        random_state=42,
+    )
+    model = GaussianNB()
+    model.fit(X_train, y_train)
+    model.feature_names = list(X_train.columns.values)
+    fname = "/tmp/model.joblib.dat"
+    joblib.dump(model, fname)
+    return fname
+
+
+@task
+def store_online(feature_store: FeatureStore):
+    feature_store.materialize(
+        start_date=datetime.utcnow() - timedelta(days=250),
+        end_date=datetime.utcnow() - timedelta(minutes=10),
+    )
+
+
+@task
+def retrieve_online(
+    feature_store: FeatureStore, dataset: pd.DataFrame
+) -> dict:
+    inference_data = random.choice(dataset["Hospital Number"])
+    logger.info(f"Hospital Number chosen for inference is: {inference_data}")
+    entity_rows = [{"Hospital Number": inference_data}]
+
+    return feature_store.get_online_features(FEAST_FEATURES, entity_rows)
+
+
+# %%
+# We define a task to test the model using the inference point fetched earlier.
+@task
+def test_model(
+    model_ser: JoblibSerializedFile,
+    inference_point: dict,
+) -> typing.List[str]:
+
+    # Load the model
+    model = joblib.load(model_ser)
+    f_names = model.feature_names
+
+    test_list = []
+    for each_name in f_names:
+        test_list.append(inference_point[each_name][0])
+    prediction = model.predict([test_list])
+    # ``predict`` returns a numpy array; convert it to a plain list to match the declared return type
+    return prediction.tolist()
+
+
+@task
+def convert_timestamp_column(
+    dataframe: FlyteSchema, timestamp_column: str
+) -> FlyteSchema:
+    df = dataframe.open().all()
+    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
+    return df
+
+
+@task
+def build_feature_store(
+    s3_bucket: str, registry_path: str, online_store_path: str
+) -> FeatureStore:
+    feature_store_config = FeatureStoreConfig(
+        project="horsecolic",
+        s3_bucket=s3_bucket,
+        registry_path=registry_path,
+        online_store_path=online_store_path,
+    )
+    return FeatureStore(config=feature_store_config)
+
+
+@workflow
+def feast_workflow(
+    imputation_method: str = "mean",
+    num_features_univariate: int = 7,
+    s3_bucket: str = "feast-integration",
+    registry_path: str = "registry.db",
+    online_store_path: str = "online.db",
+) -> typing.List[str]:
+    # Load the parquet file generated by the SQLite task
+    df = sql_task()
+    dataframe = mean_median_imputer(dataframe=df, imputation_method=imputation_method)
+    # We need to convert the timestamp column in the underlying dataframe; otherwise its type is
+    # written as string. There is probably a better way of doing this conversion.
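+    # (Feast's ``FileSource`` expects ``event_timestamp_column`` to be a genuine timestamp type in
+    # the underlying parquet file, hence the cast.)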
+    converted_df = convert_timestamp_column(
+        dataframe=dataframe, timestamp_column="timestamp"
+    )
+
+    feature_store = build_feature_store(
+        s3_bucket=s3_bucket,
+        registry_path=registry_path,
+        online_store_path=online_store_path,
+    )
+
+    # Ingest data into the offline store
+    store_offline_node = create_node(store_offline, feature_store=feature_store, dataframe=converted_df)
+
+    # Demonstrate how to load features from the offline store
+    load_historical_features_node = create_node(load_historical_features, feature_store=feature_store)
+
+    # Ingest data into the online store
+    store_online_node = create_node(store_online, feature_store=feature_store)
+
+    # Retrieve feature data from the online store
+    retrieve_online_node = create_node(retrieve_online, feature_store=feature_store, dataset=converted_df)
+
+    # Enforce the order in which the tasks that interact with the Feast SDK have to run
+    store_offline_node >> load_historical_features_node
+    load_historical_features_node >> store_online_node
+    store_online_node >> retrieve_online_node
+
+    # Use a feature retrieved from the online store for inference on a trained model
+    selected_features = univariate_selection(
+        dataframe=load_historical_features_node.o0,
+        num_features=num_features_univariate,
+        data_class=DATA_CLASS,
+    )
+    trained_model = train_model(
+        dataset=selected_features,
+        data_class=DATA_CLASS,
+    )
+    prediction = test_model(
+        model_ser=trained_model,
+        inference_point=retrieve_online_node.o0,
+    )
+
+    return prediction
+
+
+if __name__ == "__main__":
+    print(f"{feast_workflow()}")
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py b/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py
new file mode 100644
index 000000000..d6b0cc66b
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/feature_eng_tasks.py
@@ -0,0 +1,80 @@
+"""
+Feature Engineering Tasks
+-------------------------
+We'll define the relevant feature engineering tasks to clean up the SQLite3 data.
+"""
+
+# %%
+# First, let's import the required libraries.
+import numpy as np
+import pandas as pd
+from flytekit import task
+from flytekit.types.schema import FlyteSchema
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.impute import SimpleImputer
+
+# %%
+# There is a specific set of columns for which imputation isn't required; we ignore them.
+NO_IMPUTATION_COLS = [
+    "Hospital Number",
+    "surgery",
+    "Age",
+    "outcome",
+    "surgical lesion",
+    "timestamp",
+]
+
+
+# %%
+# Next, we define a ``mean_median_imputer`` task to fill in the missing values of the dataset, using the
+# `SimpleImputer <https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html>`__
+# class from the ``scikit-learn`` library.
+@task
+def mean_median_imputer(
+    dataframe: pd.DataFrame,
+    imputation_method: str,
+) -> FlyteSchema:
+    dataframe = dataframe.replace("?", np.nan)
+    if imputation_method not in ["median", "mean"]:
+        raise ValueError("imputation_method takes only the values 'median' or 'mean'")
+
+    imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_method)
+
+    # Impute only the columns that require it
+    impute_cols = dataframe.columns[~dataframe.columns.isin(NO_IMPUTATION_COLS)]
+    imputer = imputer.fit(dataframe[impute_cols])
+    dataframe[impute_cols] = imputer.transform(dataframe[impute_cols])
+    return dataframe
+
+
+# %%
+# Let's define the other task, ``univariate_selection``, which performs feature selection.
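+# The scoring is univariate: each candidate column is evaluated independently against the target
+# class using the ANOVA F-statistic (``f_classif``).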
+# The `SelectKBest <https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html>`__
+# method removes all but the highest-scoring features (data frame columns).
+@task
+def univariate_selection(
+    dataframe: pd.DataFrame, num_features: int, data_class: str
+) -> pd.DataFrame:
+    # Drop the ``event_timestamp`` and ``Hospital Number`` columns, as they shouldn't be used as features
+    dataframe = dataframe.drop(["event_timestamp", "Hospital Number"], axis=1)
+
+    if num_features > 9:
+        raise ValueError(
+            f"Number of features must be <= 9; you've given {num_features}"
+        )
+
+    X = dataframe.iloc[:, dataframe.columns != data_class]
+    y = dataframe.loc[:, data_class]
+    test = SelectKBest(score_func=f_classif, k=num_features)
+    fit = test.fit(X, y)
+    indices = np.sort((-fit.scores_).argsort()[:num_features])
+    column_names = list(X.columns[indices])
+    column_names.append(data_class)
+    features = fit.transform(X)
+    return pd.DataFrame(np.c_[features, y.to_numpy()], columns=column_names)
+
+
+# %%
+# The above feature engineering tasks are imported and used while building the Flyte pipeline with Feast.
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/requirements.in b/cookbook/case_studies/feature_engineering/feast_integration/requirements.in
new file mode 100644
index 000000000..a166ebe4f
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/requirements.in
@@ -0,0 +1,4 @@
+flytekit>=0.23.0b1
+scikit-learn
+numpy
+feast[aws]
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/requirements.txt b/cookbook/case_studies/feature_engineering/feast_integration/requirements.txt
new file mode 100644
index 000000000..faa38bda6
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/requirements.txt
@@ -0,0 +1,234 @@
+#
+# This file is autogenerated by pip-compile with python 3.9
+# To update, run:
+#
+#    pip-compile requirements.in
+#
+attrs==21.2.0
+    # via
+    #   jsonschema
+    #   scantree
+boto3==1.17.112
+    # via feast
+botocore==1.20.112
+    # via
+    #   boto3
+    #   s3transfer
+cachetools==4.2.2
+    # via google-auth
+certifi==2021.5.30
+    # via requests
+cffi==1.14.6
+    # via cryptography
+charset-normalizer==2.0.5
+    # via requests
+click==7.1.2
+    # via
+    #   feast
+    #   flytekit
+colorama==0.4.4
+    # via feast
+croniter==1.0.15
+    # via flytekit
+cryptography==3.4.8
+    # via secretstorage
+dataclasses-json==0.5.6
+    # via flytekit
+decorator==5.1.0
+    # via retry
+deprecated==1.2.13
+    # via flytekit
+dirhash==0.2.1
+    # via flytekit
+diskcache==5.2.1
+    # via flytekit
+docker-image-py==0.1.12
+    # via flytekit
+docstring-parser==0.10
+    # via flytekit
+fastavro==1.4.4
+    # via
+    #   feast
+    #   pandavro
+feast[aws]==0.12.1
+    # via -r requirements.in
+flyteidl==0.21.1
+    # via flytekit
+flytekit==0.23.0b1
+    # via -r requirements.in
+google-api-core==2.0.1
+    # via feast
+google-auth==2.1.0
+    # via google-api-core
+googleapis-common-protos==1.52.0
+    # via
+    #   feast
+    #   google-api-core
+grpcio==1.40.0
+    # via
+    #   feast
+    #   flytekit
+idna==3.2
+    # via requests
+importlib-metadata==4.8.1
+    # via keyring
+jeepney==0.7.1
+    # via
+    #   keyring
+    #   secretstorage
+jinja2==3.0.1
+    # via feast
+jmespath==0.10.0
+    # via
+    #   boto3
+    #   botocore
+joblib==1.0.1
+    # via scikit-learn
+jsonschema==3.2.0
+    # via feast
+keyring==23.2.1
+    # via flytekit
+markupsafe==2.0.1
+    # via jinja2
+marshmallow==3.13.0
+    # via
+    #   dataclasses-json
+    #   marshmallow-enum
+    #   marshmallow-jsonschema
+marshmallow-enum==1.5.1
+    # via dataclasses-json
+marshmallow-jsonschema==0.12.0
+    # via flytekit
+mmh3==3.0.0
+    # via feast
+mypy-extensions==0.4.3
+    # via typing-inspect
+natsort==7.1.1
+    # via flytekit
+numpy==1.21.2
+    # via
+    #   -r requirements.in
+    #   pandas
+    #   pandavro
+    #   pyarrow
+    #   scikit-learn
+    #   scipy
+pandas==1.3.3
+    # via
+    #   feast
+    #   flytekit
+    #   pandavro
+pandavro==1.5.2
+    # via feast
+pathspec==0.9.0
+    # via scantree
+protobuf==3.18.0
+    # via
+    #   feast
+    #   flyteidl
+    #   flytekit
+    #   google-api-core
+    #   googleapis-common-protos
+py==1.10.0
+    # via retry
+pyarrow==3.0.0
+    # via
+    #   feast
+    #   flytekit
+pyasn1==0.4.8
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.2.8
+    # via google-auth
+pycparser==2.20
+    # via cffi
+pydantic==1.8.2
+    # via feast
+pyrsistent==0.18.0
+    # via jsonschema
+python-dateutil==2.8.1
+    # via
+    #   botocore
+    #   croniter
+    #   flytekit
+    #   pandas
+python-json-logger==2.0.2
+    # via flytekit
+pytimeparse==1.1.8
+    # via flytekit
+pytz==2018.4
+    # via
+    #   flytekit
+    #   pandas
+pyyaml==5.4.1
+    # via feast
+regex==2021.8.28
+    # via docker-image-py
+requests==2.26.0
+    # via
+    #   flytekit
+    #   google-api-core
+    #   responses
+responses==0.14.0
+    # via flytekit
+retry==0.9.2
+    # via flytekit
+rsa==4.7.2
+    # via google-auth
+s3transfer==0.4.2
+    # via boto3
+scantree==0.0.1
+    # via dirhash
+scikit-learn==0.24.2
+    # via -r requirements.in
+scipy==1.7.1
+    # via scikit-learn
+secretstorage==3.3.1
+    # via keyring
+six==1.16.0
+    # via
+    #   flytekit
+    #   grpcio
+    #   jsonschema
+    #   pandavro
+    #   python-dateutil
+    #   responses
+    #   scantree
+sortedcontainers==2.4.0
+    # via flytekit
+statsd==3.3.0
+    # via flytekit
+tabulate==0.8.9
+    # via feast
+tenacity==8.0.1
+    # via feast
+threadpoolctl==2.2.0
+    # via scikit-learn
+toml==0.10.2
+    # via feast
+tqdm==4.62.2
+    # via feast
+typing-extensions==3.10.0.2
+    # via
+    #   pydantic
+    #   typing-inspect
+typing-inspect==0.7.1
+    # via dataclasses-json
+urllib3==1.26.6
+    # via
+    #   botocore
+    #   flytekit
+    #   requests
+    #   responses
+wheel==0.37.0
+    # via flytekit
+wrapt==1.12.1
+    # via
+    #   deprecated
+    #   flytekit
+zipp==3.5.0
+    # via importlib-metadata
+
+# The following packages are considered to be unsafe in a requirements file:
+# setuptools
diff --git a/cookbook/case_studies/feature_engineering/feast_integration/sandbox.config b/cookbook/case_studies/feature_engineering/feast_integration/sandbox.config
new file mode 100644
index 000000000..437e2cd1f
--- /dev/null
+++ b/cookbook/case_studies/feature_engineering/feast_integration/sandbox.config
@@ -0,0 +1,3 @@
+[sdk]
+workflow_packages=feast_integration
+python_venv=flytekit_venv