diff --git a/components/gcp/automl/create_dataset_for_tables/component.py b/components/gcp/automl/create_dataset_for_tables/component.py
new file mode 100644
index 00000000000..644fd647509
--- /dev/null
+++ b/components/gcp/automl/create_dataset_for_tables/component.py
@@ -0,0 +1,58 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import NamedTuple
+
+
+def automl_create_dataset_for_tables(
+    gcp_project_id: str,
+    gcp_region: str,
+    display_name: str,
+    description: str = None,
+    tables_dataset_metadata: dict = {},
+    retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
+    timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
+    metadata: dict = None,
+) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str)]):
+    '''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
+    '''
+    import sys
+    import subprocess
+    subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+    import google
+    from google.cloud import automl
+    client = automl.AutoMlClient()
+
+    location_path = client.location_path(gcp_project_id, gcp_region)
+    dataset_dict = {
+        'display_name': display_name,
+        'description': description,
+        'tables_dataset_metadata': tables_dataset_metadata,
+    }
+    dataset = client.create_dataset(
+        location_path,
+        dataset_dict,
+        retry or google.api_core.gapic_v1.method.DEFAULT,
+        timeout or google.api_core.gapic_v1.method.DEFAULT,
+        metadata,
+    )
+    print(dataset)
+    dataset_id = dataset.name.rsplit('/', 1)[-1]
+    return (dataset.name, dataset.create_time, dataset_id)
+
+
+if __name__ == '__main__':
+    import kfp
+    kfp.components.func_to_container_op(automl_create_dataset_for_tables, output_component_file='component.yaml', base_image='python:3.7')
diff --git a/components/gcp/automl/create_dataset_for_tables/component.yaml b/components/gcp/automl/create_dataset_for_tables/component.yaml
new file mode 100644
index 00000000000..4999a022c68
--- /dev/null
+++ b/components/gcp/automl/create_dataset_for_tables/component.yaml
@@ -0,0 +1,149 @@
+description: |
+  automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
+implementation:
+  container:
+    args:
+    - --gcp-project-id
+    - inputValue: gcp_project_id
+    - --gcp-region
+    - inputValue: gcp_region
+    - --display-name
+    - inputValue: display_name
+    - if:
+        cond:
+          isPresent: description
+        then:
+        - --description
+        - inputValue: description
+    - if:
+        cond:
+          isPresent: tables_dataset_metadata
+        then:
+        - --tables-dataset-metadata
+        - inputValue: tables_dataset_metadata
+    - if:
+        cond:
+          isPresent: retry
+        then:
+        - --retry
+        - inputValue: retry
+    - if:
+        cond:
+          isPresent: timeout
+        then:
+        - --timeout
+        - inputValue: timeout
+    - if:
+        cond:
+          isPresent: metadata
+        then:
+        - --metadata
+        - inputValue: metadata
+    - '----output-paths'
+    - outputPath: dataset_path
+    - outputPath: create_time
+    - outputPath: dataset_id
+    command:
+    - python3
+    - -u
+    - -c
+    - |
+      from typing import NamedTuple
+
+      def automl_create_dataset_for_tables(
+          gcp_project_id: str,
+          gcp_region: str,
+          display_name: str,
+          description: str = None,
+          tables_dataset_metadata: dict = {},
+          retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
+          timeout: float = None, #=google.api_core.gapic_v1.method.DEFAULT,
+          metadata: dict = None,
+      ) -> NamedTuple('Outputs', [('dataset_path', str), ('create_time', str), ('dataset_id', str)]):
+          '''automl_create_dataset_for_tables creates an empty Dataset for AutoML tables
+          '''
+          import sys
+          import subprocess
+          subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+          import google
+          from google.cloud import automl
+          client = automl.AutoMlClient()
+
+          location_path = client.location_path(gcp_project_id, gcp_region)
+          dataset_dict = {
+              'display_name': display_name,
+              'description': description,
+              'tables_dataset_metadata': tables_dataset_metadata,
+          }
+          dataset = client.create_dataset(
+              location_path,
+              dataset_dict,
+              retry or google.api_core.gapic_v1.method.DEFAULT,
+              timeout or google.api_core.gapic_v1.method.DEFAULT,
+              metadata,
+          )
+          print(dataset)
+          dataset_id = dataset.name.rsplit('/', 1)[-1]
+          return (dataset.name, dataset.create_time, dataset_id)
+
+      import json
+      import argparse
+      _missing_arg = object()
+      _parser = argparse.ArgumentParser(prog='Automl create dataset for tables', description='automl_create_dataset_for_tables creates an empty Dataset for AutoML tables\n')
+      _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--description", dest="description", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--tables-dataset-metadata", dest="tables_dataset_metadata", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--timeout", dest="timeout", type=float, required=False, default=_missing_arg)
+      _parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=3)
+      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = automl_create_dataset_for_tables(**_parsed_args)
+
+      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
+          _outputs = [_outputs]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(str(_outputs[idx]))
+    image: python:3.7
+inputs:
+- name: gcp_project_id
+  type: String
+- name: gcp_region
+  type: String
+- name: display_name
+  type: String
+- name: description
+  optional: true
+  type: String
+- default: '{}'
+  name: tables_dataset_metadata
+  optional: true
+  type: JsonObject
+- name: retry
+  optional: true
+- name: timeout
+  optional: true
+  type: Float
+- name: metadata
+  optional: true
+  type: JsonObject
+name: Automl create dataset for tables
+outputs:
+- name: dataset_path
+  type: String
+- name: create_time
+  type: String
+- name: dataset_id
+  type: String
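For orientation, here is a minimal sketch of how this component might be wired into a pipeline. The project ID, region, display name, and the local yaml path are illustrative placeholders, not part of this change:

    import kfp
    from kfp import components

    # Hypothetical local path to the component definition above.
    create_dataset_op = components.load_component_from_file(
        'components/gcp/automl/create_dataset_for_tables/component.yaml')

    @kfp.dsl.pipeline(name='create-dataset-demo')
    def demo_pipeline():
        create_dataset_task = create_dataset_op(
            gcp_project_id='my-project',      # illustrative
            gcp_region='us-central1',
            display_name='my_tables_dataset',
        )
        # Downstream steps can consume create_dataset_task.outputs['dataset_path'],
        # outputs['dataset_id'], and outputs['create_time'].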
diff --git a/components/gcp/automl/create_model_for_tables/component.py b/components/gcp/automl/create_model_for_tables/component.py
new file mode 100644
index 00000000000..17bc681e7b6
--- /dev/null
+++ b/components/gcp/automl/create_model_for_tables/component.py
@@ -0,0 +1,58 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import NamedTuple
+
+
+def automl_create_model_for_tables(
+    gcp_project_id: str,
+    gcp_region: str,
+    display_name: str,
+    dataset_id: str,
+    target_column_path: str = None,
+    input_feature_column_paths: list = None,
+    optimization_objective: str = 'MAXIMIZE_AU_PRC',
+    train_budget_milli_node_hours: int = 1000,
+) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str)]):
+    import sys
+    import subprocess
+    subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+    from google.cloud import automl
+    client = automl.AutoMlClient()
+
+    location_path = client.location_path(gcp_project_id, gcp_region)
+    model_dict = {
+        'display_name': display_name,
+        'dataset_id': dataset_id,
+        'tables_model_metadata': {
+            'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
+            'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
+            'optimization_objective': optimization_objective,
+            'train_budget_milli_node_hours': train_budget_milli_node_hours,
+        },
+    }
+
+    create_model_response = client.create_model(location_path, model_dict)
+    print('Create model operation: {}'.format(create_model_response.operation))
+    result = create_model_response.result()
+    print(result)
+    model_name = result.name
+    model_id = model_name.rsplit('/', 1)[-1]
+    return (model_name, model_id)
+
+
+if __name__ == '__main__':
+    import kfp
+    kfp.components.func_to_container_op(automl_create_model_for_tables, output_component_file='component.yaml', base_image='python:3.7')
diff --git a/components/gcp/automl/create_model_for_tables/component.yaml b/components/gcp/automl/create_model_for_tables/component.yaml
new file mode 100644
index 00000000000..3bcc0fde2c9
--- /dev/null
+++ b/components/gcp/automl/create_model_for_tables/component.yaml
@@ -0,0 +1,141 @@
+implementation:
+  container:
+    args:
+    - --gcp-project-id
+    - inputValue: gcp_project_id
+    - --gcp-region
+    - inputValue: gcp_region
+    - --display-name
+    - inputValue: display_name
+    - --dataset-id
+    - inputValue: dataset_id
+    - if:
+        cond:
+          isPresent: target_column_path
+        then:
+        - --target-column-path
+        - inputValue: target_column_path
+    - if:
+        cond:
+          isPresent: input_feature_column_paths
+        then:
+        - --input-feature-column-paths
+        - inputValue: input_feature_column_paths
+    - if:
+        cond:
+          isPresent: optimization_objective
+        then:
+        - --optimization-objective
+        - inputValue: optimization_objective
+    - if:
+        cond:
+          isPresent: train_budget_milli_node_hours
+        then:
+        - --train-budget-milli-node-hours
+        - inputValue: train_budget_milli_node_hours
+    - '----output-paths'
+    - outputPath: model_path
+    - outputPath: model_id
+    command:
+    - python3
+    - -u
+    - -c
+    - |
+      from typing import NamedTuple
+
+      def automl_create_model_for_tables(
+          gcp_project_id: str,
+          gcp_region: str,
+          display_name: str,
+          dataset_id: str,
+          target_column_path: str = None,
+          input_feature_column_paths: list = None,
+          optimization_objective: str = 'MAXIMIZE_AU_PRC',
+          train_budget_milli_node_hours: int = 1000,
+      ) -> NamedTuple('Outputs', [('model_path', str), ('model_id', str)]):
+          import sys
+          import subprocess
+          subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+          from google.cloud import automl
+          client = automl.AutoMlClient()
+
+          location_path = client.location_path(gcp_project_id, gcp_region)
+          model_dict = {
+              'display_name': display_name,
+              'dataset_id': dataset_id,
+              'tables_model_metadata': {
+                  'target_column_spec': automl.types.ColumnSpec(name=target_column_path),
+                  'input_feature_column_specs': [automl.types.ColumnSpec(name=path) for path in input_feature_column_paths] if input_feature_column_paths else None,
+                  'optimization_objective': optimization_objective,
+                  'train_budget_milli_node_hours': train_budget_milli_node_hours,
+              },
+          }
+
+          create_model_response = client.create_model(location_path, model_dict)
+          print('Create model operation: {}'.format(create_model_response.operation))
+          result = create_model_response.result()
+          print(result)
+          model_name = result.name
+          model_id = model_name.rsplit('/', 1)[-1]
+          return (model_name, model_id)
+
+      import json
+      import argparse
+      _missing_arg = object()
+      _parser = argparse.ArgumentParser(prog='Automl create model for tables', description='')
+      _parser.add_argument("--gcp-project-id", dest="gcp_project_id", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--gcp-region", dest="gcp_region", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--display-name", dest="display_name", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--dataset-id", dest="dataset_id", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--target-column-path", dest="target_column_path", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--input-feature-column-paths", dest="input_feature_column_paths", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("--optimization-objective", dest="optimization_objective", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--train-budget-milli-node-hours", dest="train_budget_milli_node_hours", type=int, required=False, default=_missing_arg)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
+      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = automl_create_model_for_tables(**_parsed_args)
+
+      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
+          _outputs = [_outputs]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(str(_outputs[idx]))
+    image: python:3.7
+inputs:
+- name: gcp_project_id
+  type: String
+- name: gcp_region
+  type: String
+- name: display_name
+  type: String
+- name: dataset_id
+  type: String
+- name: target_column_path
+  optional: true
+  type: String
+- name: input_feature_column_paths
+  optional: true
+  type: JsonArray
+- default: MAXIMIZE_AU_PRC
+  name: optimization_objective
+  optional: true
+  type: String
+- default: '1000'
+  name: train_budget_milli_node_hours
+  optional: true
+  type: Integer
name: Automl create model for tables
+outputs:
+- name: model_path
+  type: String
+- name: model_id
+  type: String
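Note that train_budget_milli_node_hours is expressed in thousandths of a node hour, so the default of 1000 buys one node hour of training. A hedged sketch of chaining this op after the dataset step (op handles and values are illustrative, following the naming used in the sample notebook at the end of this patch):

    model_task = automl_create_model_for_tables_op(
        gcp_project_id='my-project',       # illustrative
        gcp_region='us-central1',
        display_name='my_model',
        dataset_id=create_dataset_task.outputs['dataset_id'],
        optimization_objective='MAXIMIZE_AU_PRC',
        train_budget_milli_node_hours=1000,  # 1000 milli node-hours = 1 node-hour
    )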
diff --git a/components/gcp/automl/import_data_from_bigquery/component.py b/components/gcp/automl/import_data_from_bigquery/component.py
new file mode 100644
index 00000000000..61df403cee4
--- /dev/null
+++ b/components/gcp/automl/import_data_from_bigquery/component.py
@@ -0,0 +1,53 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import NamedTuple
+
+
+def automl_import_data_from_bigquery(
+    dataset_path: str,
+    input_uri: str,
+    retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
+    timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
+    metadata: dict = None,
+) -> NamedTuple('Outputs', [('dataset_path', str)]):
+    import sys
+    import subprocess
+    subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+    import google
+    from google.cloud import automl
+    client = automl.AutoMlClient()
+    input_config = {
+        'bigquery_source': {
+            'input_uri': input_uri,
+        },
+    }
+    response = client.import_data(
+        dataset_path,
+        input_config,
+        retry or google.api_core.gapic_v1.method.DEFAULT,
+        timeout or google.api_core.gapic_v1.method.DEFAULT,
+        metadata,
+    )
+    result = response.result()  # Blocks until the import operation completes.
+    print(result)
+    operation_metadata = response.metadata  # Renamed so it does not shadow the metadata parameter.
+    print(operation_metadata)
+    return (dataset_path,)
+
+
+if __name__ == '__main__':
+    import kfp
+    kfp.components.func_to_container_op(automl_import_data_from_bigquery, output_component_file='component.yaml', base_image='python:3.7')
diff --git a/components/gcp/automl/import_data_from_bigquery/component.yaml b/components/gcp/automl/import_data_from_bigquery/component.yaml
new file mode 100644
index 00000000000..1a5a996ed13
--- /dev/null
+++ b/components/gcp/automl/import_data_from_bigquery/component.yaml
@@ -0,0 +1,108 @@
+implementation:
+  container:
+    args:
+    - --dataset-path
+    - inputValue: dataset_path
+    - --input-uri
+    - inputValue: input_uri
+    - if:
+        cond:
+          isPresent: retry
+        then:
+        - --retry
+        - inputValue: retry
+    - if:
+        cond:
+          isPresent: timeout
+        then:
+        - --timeout
+        - inputValue: timeout
+    - if:
+        cond:
+          isPresent: metadata
+        then:
+        - --metadata
+        - inputValue: metadata
+    - '----output-paths'
+    - outputPath: dataset_path
+    command:
+    - python3
+    - -u
+    - -c
+    - |
+      from typing import NamedTuple
+
+      def automl_import_data_from_bigquery(
+          dataset_path: str,
+          input_uri: str,
+          retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
+          timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
+          metadata: dict = None,
+      ) -> NamedTuple('Outputs', [('dataset_path', str)]):
+          import sys
+          import subprocess
+          subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+          import google
+          from google.cloud import automl
+          client = automl.AutoMlClient()
+          input_config = {
+              'bigquery_source': {
+                  'input_uri': input_uri,
+              },
+          }
+          response = client.import_data(
+              dataset_path,
+              input_config,
+              retry or google.api_core.gapic_v1.method.DEFAULT,
+              timeout or google.api_core.gapic_v1.method.DEFAULT,
+              metadata,
+          )
+          result = response.result()  # Blocks until the import operation completes.
+          print(result)
+          operation_metadata = response.metadata  # Renamed so it does not shadow the metadata parameter.
+          print(operation_metadata)
+          return (dataset_path,)
+
+      import json
+      import argparse
+      _missing_arg = object()
+      _parser = argparse.ArgumentParser(prog='Automl import data from bigquery', description='')
+      _parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--input-uri", dest="input_uri", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
+      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = automl_import_data_from_bigquery(**_parsed_args)
+
+      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
+          _outputs = [_outputs]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(str(_outputs[idx]))
+    image: python:3.7
+inputs:
+- name: dataset_path
+  type: String
+- name: input_uri
+  type: String
+- name: retry
+  optional: true
+- name: timeout
+  optional: true
+- name: metadata
+  optional: true
+  type: JsonObject
+name: Automl import data from bigquery
+outputs:
+- name: dataset_path
+  type: String
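The BigQuery input URI uses the bq:// scheme, as in the sample notebook at the end of this patch. A minimal, illustrative wiring (the table path is a placeholder):

    import_data_task = automl_import_data_from_bigquery_op(
        dataset_path=create_dataset_task.outputs['dataset_path'],
        input_uri='bq://my-project.my_dataset.my_table',  # illustrative
    )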
diff --git a/components/gcp/automl/import_data_from_gcs/component.py b/components/gcp/automl/import_data_from_gcs/component.py
new file mode 100644
index 00000000000..acc69950f7a
--- /dev/null
+++ b/components/gcp/automl/import_data_from_gcs/component.py
@@ -0,0 +1,53 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import NamedTuple
+
+
+def automl_import_data_from_gcs(
+    dataset_path: str,
+    input_uris: list,
+    retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
+    timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
+    metadata: dict = None,
+) -> NamedTuple('Outputs', [('dataset_path', str)]):
+    import sys
+    import subprocess
+    subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+    import google
+    from google.cloud import automl
+    client = automl.AutoMlClient()
+    input_config = {
+        'gcs_source': {
+            'input_uris': input_uris,
+        },
+    }
+    response = client.import_data(
+        dataset_path,
+        input_config,
+        retry or google.api_core.gapic_v1.method.DEFAULT,
+        timeout or google.api_core.gapic_v1.method.DEFAULT,
+        metadata,
+    )
+    result = response.result()  # Blocks until the import operation completes.
+    print(result)
+    operation_metadata = response.metadata  # Renamed so it does not shadow the metadata parameter.
+    print(operation_metadata)
+    return (dataset_path,)
+
+
+if __name__ == '__main__':
+    import kfp
+    kfp.components.func_to_container_op(automl_import_data_from_gcs, output_component_file='component.yaml', base_image='python:3.7')
diff --git a/components/gcp/automl/import_data_from_gcs/component.yaml b/components/gcp/automl/import_data_from_gcs/component.yaml
new file mode 100644
index 00000000000..b5cc16c39e0
--- /dev/null
+++ b/components/gcp/automl/import_data_from_gcs/component.yaml
@@ -0,0 +1,109 @@
+implementation:
+  container:
+    args:
+    - --dataset-path
+    - inputValue: dataset_path
+    - --input-uris
+    - inputValue: input_uris
+    - if:
+        cond:
+          isPresent: retry
+        then:
+        - --retry
+        - inputValue: retry
+    - if:
+        cond:
+          isPresent: timeout
+        then:
+        - --timeout
+        - inputValue: timeout
+    - if:
+        cond:
+          isPresent: metadata
+        then:
+        - --metadata
+        - inputValue: metadata
+    - '----output-paths'
+    - outputPath: dataset_path
+    command:
+    - python3
+    - -u
+    - -c
+    - |
+      from typing import NamedTuple
+
+      def automl_import_data_from_gcs(
+          dataset_path: str,
+          input_uris: list,
+          retry=None, #=google.api_core.gapic_v1.method.DEFAULT,
+          timeout=None, #=google.api_core.gapic_v1.method.DEFAULT,
+          metadata: dict = None,
+      ) -> NamedTuple('Outputs', [('dataset_path', str)]):
+          import sys
+          import subprocess
+          subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+          import google
+          from google.cloud import automl
+          client = automl.AutoMlClient()
+          input_config = {
+              'gcs_source': {
+                  'input_uris': input_uris,
+              },
+          }
+          response = client.import_data(
+              dataset_path,
+              input_config,
+              retry or google.api_core.gapic_v1.method.DEFAULT,
+              timeout or google.api_core.gapic_v1.method.DEFAULT,
+              metadata,
+          )
+          result = response.result()  # Blocks until the import operation completes.
+          print(result)
+          operation_metadata = response.metadata  # Renamed so it does not shadow the metadata parameter.
+          print(operation_metadata)
+          return (dataset_path,)
+
+      import json
+      import argparse
+      _missing_arg = object()
+      _parser = argparse.ArgumentParser(prog='Automl import data from gcs', description='')
+      _parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--input-uris", dest="input_uris", type=json.loads, required=True, default=_missing_arg)
+      _parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1)
+      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = automl_import_data_from_gcs(**_parsed_args)
+
+      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
+          _outputs = [_outputs]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(str(_outputs[idx]))
+    image: python:3.7
+inputs:
+- name: dataset_path
+  type: String
+- name: input_uris
+  type: JsonArray
+- name: retry
+  optional: true
+- name: timeout
+  optional: true
+- name: metadata
+  optional: true
+  type: JsonObject
+name: Automl import data from gcs
+outputs:
+- name: dataset_path
+  type: String
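Unlike the BigQuery variant, input_uris is a JsonArray. When calling the op with a constant, a Python list (or its JSON serialization, depending on SDK version) is what the input expects. An illustrative sketch, with placeholder bucket and file names:

    import json

    import_data_task = automl_import_data_from_gcs_op(
        dataset_path=create_dataset_task.outputs['dataset_path'],
        input_uris=json.dumps(['gs://my-bucket/data/train.csv']),  # illustrative
    )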
diff --git a/components/gcp/automl/prediction_service_batch_predict/component.py b/components/gcp/automl/prediction_service_batch_predict/component.py
new file mode 100644
index 00000000000..923ac013dc4
--- /dev/null
+++ b/components/gcp/automl/prediction_service_batch_predict/component.py
@@ -0,0 +1,69 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import NamedTuple
+
+
+def automl_prediction_service_batch_predict(
+    model_path,
+    gcs_input_uris: list = None,
+    gcs_output_uri_prefix: str = None,
+    bq_input_uri: str = None,
+    bq_output_uri: str = None,
+    params=None,
+    retry=None, #google.api_core.gapic_v1.method.DEFAULT,
+    timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
+    metadata: dict = None,
+) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
+    import sys
+    import subprocess
+    subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+    input_config = {}
+    if gcs_input_uris:
+        input_config['gcs_source'] = {'input_uris': gcs_input_uris}
+    if bq_input_uri:
+        input_config['bigquery_source'] = {'input_uri': bq_input_uri}
+
+    output_config = {}
+    if gcs_output_uri_prefix:
+        output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
+    if bq_output_uri:
+        output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
+
+    from google.cloud import automl
+    client = automl.PredictionServiceClient()
+    response = client.batch_predict(
+        model_path,
+        input_config,
+        output_config,
+        params,
+        retry,
+        timeout,
+        metadata,
+    )
+    print('Operation started:')
+    print(response.operation)
+    result = response.result()  # Blocks until the batch prediction operation completes.
+    operation_metadata = response.metadata  # Renamed so it does not shadow the metadata parameter.
+    print('Operation finished:')
+    print(operation_metadata)
+    output_info = operation_metadata.batch_predict_details.output_info
+    # Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo/pull/1277/files#r326028422
+    return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
+
+
+if __name__ == '__main__':
+    import kfp
+    kfp.components.func_to_container_op(automl_prediction_service_batch_predict, output_component_file='component.yaml', base_image='python:3.7')
diff --git a/components/gcp/automl/prediction_service_batch_predict/component.yaml b/components/gcp/automl/prediction_service_batch_predict/component.yaml
new file mode 100644
index 00000000000..0299f34c4b9
--- /dev/null
+++ b/components/gcp/automl/prediction_service_batch_predict/component.yaml
@@ -0,0 +1,171 @@
+implementation:
+  container:
+    args:
+    - --model-path
+    - inputValue: model_path
+    - if:
+        cond:
+          isPresent: gcs_input_uris
+        then:
+        - --gcs-input-uris
+        - inputValue: gcs_input_uris
+    - if:
+        cond:
+          isPresent: gcs_output_uri_prefix
+        then:
+        - --gcs-output-uri-prefix
+        - inputValue: gcs_output_uri_prefix
+    - if:
+        cond:
+          isPresent: bq_input_uri
+        then:
+        - --bq-input-uri
+        - inputValue: bq_input_uri
+    - if:
+        cond:
+          isPresent: bq_output_uri
+        then:
+        - --bq-output-uri
+        - inputValue: bq_output_uri
+    - if:
+        cond:
+          isPresent: params
+        then:
+        - --params
+        - inputValue: params
+    - if:
+        cond:
+          isPresent: retry
+        then:
+        - --retry
+        - inputValue: retry
+    - if:
+        cond:
+          isPresent: timeout
+        then:
+        - --timeout
+        - inputValue: timeout
+    - if:
+        cond:
+          isPresent: metadata
+        then:
+        - --metadata
+        - inputValue: metadata
+    - '----output-paths'
+    - outputPath: gcs_output_directory
+    - outputPath: bigquery_output_dataset
+    command:
+    - python3
+    - -u
+    - -c
+    - |
+      from typing import NamedTuple
+
+      def automl_prediction_service_batch_predict(
+          model_path,
+          gcs_input_uris: list = None,
+          gcs_output_uri_prefix: str = None,
+          bq_input_uri: str = None,
+          bq_output_uri: str = None,
+          params=None,
+          retry=None, #google.api_core.gapic_v1.method.DEFAULT,
+          timeout=None, #google.api_core.gapic_v1.method.DEFAULT,
+          metadata: dict = None,
+      ) -> NamedTuple('Outputs', [('gcs_output_directory', str), ('bigquery_output_dataset', str)]):
+          import sys
+          import subprocess
+          subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+          input_config = {}
+          if gcs_input_uris:
+              input_config['gcs_source'] = {'input_uris': gcs_input_uris}
+          if bq_input_uri:
+              input_config['bigquery_source'] = {'input_uri': bq_input_uri}
+
+          output_config = {}
+          if gcs_output_uri_prefix:
+              output_config['gcs_destination'] = {'output_uri_prefix': gcs_output_uri_prefix}
+          if bq_output_uri:
+              output_config['bigquery_destination'] = {'output_uri': bq_output_uri}
+
+          from google.cloud import automl
+          client = automl.PredictionServiceClient()
+          response = client.batch_predict(
+              model_path,
+              input_config,
+              output_config,
+              params,
+              retry,
+              timeout,
+              metadata,
+          )
+          print('Operation started:')
+          print(response.operation)
+          result = response.result()  # Blocks until the batch prediction operation completes.
+          operation_metadata = response.metadata  # Renamed so it does not shadow the metadata parameter.
+          print('Operation finished:')
+          print(operation_metadata)
+          output_info = operation_metadata.batch_predict_details.output_info
+          # Workaround for Argo issue - it fails when output is empty: https://github.com/argoproj/argo/pull/1277/files#r326028422
+          return (output_info.gcs_output_directory or '-', output_info.bigquery_output_dataset or '-')
+
+      import json
+      import argparse
+      _missing_arg = object()
+      _parser = argparse.ArgumentParser(prog='Automl prediction service batch predict', description='')
+      _parser.add_argument("--model-path", dest="model_path", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--gcs-input-uris", dest="gcs_input_uris", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("--gcs-output-uri-prefix", dest="gcs_output_uri_prefix", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--bq-input-uri", dest="bq_input_uri", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--bq-output-uri", dest="bq_output_uri", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--params", dest="params", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--retry", dest="retry", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--timeout", dest="timeout", type=str, required=False, default=_missing_arg)
+      _parser.add_argument("--metadata", dest="metadata", type=json.loads, required=False, default=_missing_arg)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
+      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = automl_prediction_service_batch_predict(**_parsed_args)
+
+      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
+          _outputs = [_outputs]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(str(_outputs[idx]))
+    image: python:3.7
+inputs:
+- name: model_path
+- name: gcs_input_uris
+  optional: true
+  type: JsonArray
+- name: gcs_output_uri_prefix
+  optional: true
+  type: String
+- name: bq_input_uri
+  optional: true
+  type: String
+- name: bq_output_uri
+  optional: true
+  type: String
+- name: params
+  optional: true
+- name: retry
+  optional: true
+- name: timeout
+  optional: true
+- name: metadata
+  optional: true
+  type: JsonObject
+name: Automl prediction service batch predict
+outputs:
+- name: gcs_output_directory
+  type: String
+- name: bigquery_output_dataset
+  type: String
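Because empty outputs are replaced with the '-' placeholder (the Argo workaround noted in the code above), downstream steps should treat '-' as "this destination was not used". A hedged sketch of an illustrative consumer function:

    def consume_batch_predict_outputs(gcs_output_directory: str, bigquery_output_dataset: str):
        # '-' is the component's placeholder for an unused destination.
        if gcs_output_directory != '-':
            print('Results written under', gcs_output_directory)
        if bigquery_output_dataset != '-':
            print('Results written to', bigquery_output_dataset)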
diff --git a/components/gcp/automl/split_dataset_table_column_names/component.py b/components/gcp/automl/split_dataset_table_column_names/component.py
new file mode 100644
index 00000000000..0f19842a03c
--- /dev/null
+++ b/components/gcp/automl/split_dataset_table_column_names/component.py
@@ -0,0 +1,50 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import NamedTuple
+
+
+def automl_split_dataset_table_column_names(
+    dataset_path: str,
+    target_column_name: str,
+    table_index: int = 0,
+) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
+    import sys
+    import subprocess
+    subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+    from google.cloud import automl
+    client = automl.AutoMlClient()
+    list_table_specs_response = client.list_table_specs(dataset_path)
+    table_specs = [s for s in list_table_specs_response]
+    print('table_specs=')
+    print(table_specs)
+    table_spec_name = table_specs[table_index].name
+
+    list_column_specs_response = client.list_column_specs(table_spec_name)
+    column_specs = [s for s in list_column_specs_response]
+    print('column_specs=')
+    print(column_specs)
+
+    target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
+    feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
+    feature_column_names = [s.name for s in feature_column_specs]
+
+    import json
+    return (target_column_spec.name, json.dumps(feature_column_names))
+
+
+if __name__ == '__main__':
+    import kfp
+    kfp.components.func_to_container_op(automl_split_dataset_table_column_names, output_component_file='component.yaml', base_image='python:3.7')
diff --git a/components/gcp/automl/split_dataset_table_column_names/component.yaml b/components/gcp/automl/split_dataset_table_column_names/component.yaml
new file mode 100644
index 00000000000..27e9fee18e9
--- /dev/null
+++ b/components/gcp/automl/split_dataset_table_column_names/component.yaml
@@ -0,0 +1,91 @@
+implementation:
+  container:
+    args:
+    - --dataset-path
+    - inputValue: dataset_path
+    - --target-column-name
+    - inputValue: target_column_name
+    - if:
+        cond:
+          isPresent: table_index
+        then:
+        - --table-index
+        - inputValue: table_index
+    - '----output-paths'
+    - outputPath: target_column_path
+    - outputPath: feature_column_paths
+    command:
+    - python3
+    - -u
+    - -c
+    - |
+      from typing import NamedTuple
+
+      def automl_split_dataset_table_column_names(
+          dataset_path: str,
+          target_column_name: str,
+          table_index: int = 0,
+      ) -> NamedTuple('Outputs', [('target_column_path', str), ('feature_column_paths', list)]):
+          import sys
+          import subprocess
+          subprocess.run([sys.executable, '-m', 'pip', 'install', 'google-cloud-automl==0.4.0', '--quiet', '--no-warn-script-location'], env={'PIP_DISABLE_PIP_VERSION_CHECK': '1'}, check=True)
+
+          from google.cloud import automl
+          client = automl.AutoMlClient()
+          list_table_specs_response = client.list_table_specs(dataset_path)
+          table_specs = [s for s in list_table_specs_response]
+          print('table_specs=')
+          print(table_specs)
+          table_spec_name = table_specs[table_index].name
+
+          list_column_specs_response = client.list_column_specs(table_spec_name)
+          column_specs = [s for s in list_column_specs_response]
+          print('column_specs=')
+          print(column_specs)
+
+          target_column_spec = [s for s in column_specs if s.display_name == target_column_name][0]
+          feature_column_specs = [s for s in column_specs if s.display_name != target_column_name]
+          feature_column_names = [s.name for s in feature_column_specs]
+
+          import json
+          return (target_column_spec.name, json.dumps(feature_column_names))
+
+      import argparse
+      _missing_arg = object()
+      _parser = argparse.ArgumentParser(prog='Automl split dataset table column names', description='')
+      _parser.add_argument("--dataset-path", dest="dataset_path", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--target-column-name", dest="target_column_name", type=str, required=True, default=_missing_arg)
+      _parser.add_argument("--table-index", dest="table_index", type=int, required=False, default=_missing_arg)
+      _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=2)
+      _parsed_args = {k: v for k, v in vars(_parser.parse_args()).items() if v is not _missing_arg}
+      _output_files = _parsed_args.pop("_output_paths", [])
+
+      _outputs = automl_split_dataset_table_column_names(**_parsed_args)
+
+      if not hasattr(_outputs, '__getitem__') or isinstance(_outputs, str):
+          _outputs = [_outputs]
+
+      import os
+      for idx, output_file in enumerate(_output_files):
+          try:
+              os.makedirs(os.path.dirname(output_file))
+          except OSError:
+              pass
+          with open(output_file, 'w') as f:
+              f.write(str(_outputs[idx]))
+    image: python:3.7
+inputs:
+- name: dataset_path
+  type: String
+- name: target_column_name
+  type: String
+- default: '0'
+  name: table_index
+  optional: true
+  type: Integer
+name: Automl split dataset table column names
+outputs:
+- name: target_column_path
+  type: String
+- name: feature_column_paths
+  type: JsonArray
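Note that this component returns AutoML column-spec resource names (paths), not display names, and feature_column_paths is a JSON-encoded list. An illustrative sketch of consuming the outputs, mirroring the sample notebook below:

    split_column_specs = automl_split_dataset_table_column_names_op(
        dataset_path=import_data_task.outputs['dataset_path'],
        target_column_name='Stockout',  # illustrative target column
        table_index=0,
    )
    # split_column_specs.outputs['target_column_path'] and
    # split_column_specs.outputs['feature_column_paths'] feed the create-model step.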
diff --git a/samples/core/AutoML tables/AutoML Tables - Retail product stockout prediction.ipynb b/samples/core/AutoML tables/AutoML Tables - Retail product stockout prediction.ipynb
new file mode 100644
index 00000000000..7a0e0c33fde
--- /dev/null
+++ b/samples/core/AutoML tables/AutoML Tables - Retail product stockout prediction.ipynb
@@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Kubeflow Pipelines - Retail Product Stockouts Prediction using AutoML Tables\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Configuration\n",
+    "\n",
+    "PROJECT_ID = \"\"\n",
+    "COMPUTE_REGION = \"us-central1\" # Currently \"us-central1\" is the only region supported by AutoML Tables.\n",
+    "# The bucket must be Regional (not multi-regional) and the region should be us-central1. This is a limitation of the batch prediction service.\n",
+    "batch_predict_gcs_output_uri_prefix = 'gs://<bucket>/<folder>/'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# AutoML Tables components\n",
+    "\n",
+    "from kfp.components import load_component_from_url\n",
+    "\n",
+    "automl_create_dataset_for_tables_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_dataset_for_tables/component.yaml')\n",
+    "automl_import_data_from_bigquery_source_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/import_data_from_bigquery/component.yaml')\n",
+    "automl_create_model_for_tables_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/create_model_for_tables/component.yaml')\n",
+    "automl_prediction_service_batch_predict_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/prediction_service_batch_predict/component.yaml')\n",
+    "automl_split_dataset_table_column_names_op = load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/b3179d86b239a08bf4884b50dbf3a9151da96d66/components/gcp/automl/split_dataset_table_column_names/component.yaml')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the pipeline\n",
+    "import kfp\n",
+    "\n",
+    "def retail_product_stockout_prediction_pipeline(\n",
+    "    gcp_project_id: str,\n",
+    "    gcp_region: str,\n",
+    "    batch_predict_gcs_output_uri_prefix: str,\n",
+    "    dataset_bq_input_uri: str = 'bq://product-stockout.product_stockout.stockout',\n",
+    "    dataset_display_name: str = 'stockout_data',\n",
+    "    target_column_name: str = 'Stockout',\n",
+    "    model_display_name: str = 'stockout_model',\n",
+    "    batch_predict_bq_input_uri: str = 'bq://product-stockout.product_stockout.batch_prediction_inputs',\n",
+    "    train_budget_milli_node_hours: 'Integer' = 1000,\n",
+    "):\n",
+    "    # Create dataset\n",
+    "    create_dataset_task = automl_create_dataset_for_tables_op(\n",
+    "        gcp_project_id=gcp_project_id,\n",
+    "        gcp_region=gcp_region,\n",
+    "        display_name=dataset_display_name,\n",
+    "    )\n",
+    "\n",
+    "    # Import data\n",
+    "    import_data_task = automl_import_data_from_bigquery_source_op(\n",
+    "        dataset_path=create_dataset_task.outputs['dataset_path'],\n",
+    "        input_uri=dataset_bq_input_uri,\n",
+    "    )\n",
+    "\n",
+    "    # Prepare column schemas\n",
+    "    split_column_specs = automl_split_dataset_table_column_names_op(\n",
+    "        dataset_path=import_data_task.outputs['dataset_path'],\n",
+    "        table_index=0,\n",
+    "        target_column_name=target_column_name,\n",
+    "    )\n",
+    "\n",
+    "    # Train a model\n",
+    "    create_model_task = automl_create_model_for_tables_op(\n",
+    "        gcp_project_id=gcp_project_id,\n",
+    "        gcp_region=gcp_region,\n",
+    "        display_name=model_display_name,\n",
+    "        dataset_id=create_dataset_task.outputs['dataset_id'],\n",
+    "        target_column_path=split_column_specs.outputs['target_column_path'],\n",
+    "        #input_feature_column_paths=None, # All non-target columns will be used if None is passed\n",
+    "        input_feature_column_paths=split_column_specs.outputs['feature_column_paths'],\n",
+    "        optimization_objective='MAXIMIZE_AU_PRC',\n",
+    "        train_budget_milli_node_hours=train_budget_milli_node_hours,\n",
+    "    ).after(import_data_task)\n",
+    "\n",
+    "    # Batch prediction\n",
+    "    batch_predict_task = automl_prediction_service_batch_predict_op(\n",
+    "        model_path=create_model_task.outputs['model_path'],\n",
+    "        bq_input_uri=batch_predict_bq_input_uri,\n",
+    "        gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,\n",
+    "    )\n",
+    "\n",
+    "    from kfp.gcp import use_gcp_secret\n",
+    "    kfp.dsl.get_pipeline_conf().add_op_transformer(use_gcp_secret('user-gcp-sa'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run the pipeline\n",
+    "\n",
+    "kfp.run_pipeline_func_on_cluster(\n",
+    "    retail_product_stockout_prediction_pipeline,\n",
+    "    arguments=dict(\n",
+    "        gcp_project_id=PROJECT_ID,\n",
+    "        gcp_region=COMPUTE_REGION,\n",
+    "        dataset_bq_input_uri='bq://product-stockout.product_stockout.stockout',\n",
+    "        batch_predict_bq_input_uri='bq://product-stockout.product_stockout.batch_prediction_inputs',\n",
+    "        batch_predict_gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,\n",
+    "    )\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
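As an alternative to run_pipeline_func_on_cluster, a minimal sketch of submitting the same pipeline through an explicit client (this assumes a reachable Kubeflow Pipelines endpoint; the variables come from the notebook's configuration cell):

    import kfp

    client = kfp.Client()  # picks up an in-cluster or locally configured KFP endpoint
    client.create_run_from_pipeline_func(
        retail_product_stockout_prediction_pipeline,
        arguments=dict(
            gcp_project_id=PROJECT_ID,
            gcp_region=COMPUTE_REGION,
            batch_predict_gcs_output_uri_prefix=batch_predict_gcs_output_uri_prefix,
        ),
    )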