From fe90742ee1733d2baabf95113640483833436bb1 Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Wed, 12 Dec 2018 15:57:50 -0800
Subject: [PATCH 01/13] Added component definitions to our components

Added the Kubeflow training sample pipeline that uses components
---
 components/dataflow/predict/component.yaml    | 30 +++++++
 components/dataflow/tft/component.yaml        | 29 +++++++
 components/kubeflow/dnntrainer/component.yaml | 35 ++++++++
 .../local/confusion_matrix/component.yaml     | 20 +++++
 components/local/roc/component.yaml           | 21 +++++
 .../kubeflow-training-classification.py       | 80 ++++---------------
 6 files changed, 151 insertions(+), 64 deletions(-)
 create mode 100644 components/dataflow/predict/component.yaml
 create mode 100644 components/dataflow/tft/component.yaml
 create mode 100644 components/kubeflow/dnntrainer/component.yaml
 create mode 100644 components/local/confusion_matrix/component.yaml
 create mode 100644 components/local/roc/component.yaml

diff --git a/components/dataflow/predict/component.yaml b/components/dataflow/predict/component.yaml
new file mode 100644
index 00000000000..4ea14ecced6
--- /dev/null
+++ b/components/dataflow/predict/component.yaml
@@ -0,0 +1,30 @@
+name: Predict using TF on Dataflow
+description: |
+  Runs TensorFlow prediction on Google Cloud Dataflow
+  Input and output data is in GCS
+inputs:
+  - {name: Data file pattern, type: {GcsUriPattern: [text, CSV]}, description: 'GCS or local path of test file patterns.'}
+  - {name: Schema, type: {GcsUri: [text, json]}, description: 'GCS json schema file path.'}
+  - {name: Target, type: String, description: 'Name of the column for prediction target.'}
+  - {name: Model, type: {GcsUri: [Directory, Exported TensorFlow models]}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model.
#TODO: Output single model from trainer
+# - {name: Batch size, type: Integer, default: 32, description: 'Batch size used in prediction.'}
+  - {name: Run mode, type: {Enum: [local, cloud]}, description: 'Whether to run the job locally or in Cloud Dataflow.'}
+  - {name: GCP project, type: GCP project, description: 'The GCP project to run the dataflow job.'}
+  - {name: Predictions dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
+outputs:
+  - {name: Predictions dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
+    command: [python, /ml/predict.py] #python2.7
+    args: [
+      --data, {inputValue: Data file pattern},
+      --schema, {inputValue: Schema},
+      --target, {inputValue: Target},
+      --model, {inputValue: Model},
+      --mode, {inputValue: Run mode},
+      --project, {inputValue: GCP project},
+      --output, {inputValue: Predictions dir},
+    ]
+  fileOutputs:
+    Predictions dir: /output.txt
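Each definition in this patch follows the same component.yaml contract: "inputs" declares named, typed parameters, "implementation.container.args" wires them into the command line through {inputValue: ...} placeholders, and "fileOutputs" maps each declared output to a file the container writes. As a rough illustration of how such a file becomes a pipeline step (a sketch only; the local path is illustrative and assumes a checkout of this repo):

import kfp.components as comp

# The factory function's signature is generated from the "inputs" section;
# an input named "Data file pattern" becomes the parameter data_file_pattern.
dataflow_tf_predict_op = comp.load_component_from_file(
    'components/dataflow/predict/component.yaml')

# Calling the factory inside a @dsl.pipeline function creates a task whose
# container arguments (--data, --schema, ...) are filled from the call.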
diff --git a/components/dataflow/tft/component.yaml b/components/dataflow/tft/component.yaml
new file mode 100644
index 00000000000..0e41a967d9a
--- /dev/null
+++ b/components/dataflow/tft/component.yaml
@@ -0,0 +1,29 @@
+name: Transform using TF on Dataflow
+description: |
+  Runs TensorFlow Transform on Google Cloud Dataflow
+  Input and output data is in GCS
+inputs:
+  - {name: Training data file pattern, type: {GcsUriPattern: [text, CSV]}, description: 'GCS path of train file patterns.'} #Also supports local CSV
+  - {name: Evaluation data file pattern, type: {GcsUriPattern: [text, CSV]}, description: 'GCS path of eval file patterns.'} #Also supports local CSV
+  - {name: Schema, type: {GcsUri: [text, json]}, description: 'GCS json schema file path.'}
+  - {name: GCP project, type: GCP project, description: 'The GCP project to run the dataflow job.'}
+  - {name: Run mode, type: {Enum: [local, cloud]}, description: 'Whether to run the job locally or in Cloud Dataflow.'}
+  - {name: Preprocessing module, type: {GcsUri: [text, python]}, description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions. Can be empty.'}
+  - {name: Transformed data dir, type: {GcsUri: Directory}, description: 'GCS or local directory'} #Also supports local paths
+outputs:
+  - {name: Transformed data dir, type: {GcsUri: Directory}}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
+    command: [python, /ml/transform.py] #python2.7
+    args: [
+      --train, {inputValue: Training data file pattern},
+      --eval, {inputValue: Evaluation data file pattern},
+      --schema, {inputValue: Schema},
+      --project, {inputValue: GCP project},
+      --mode, {inputValue: Run mode},
+      --preprocessing-module, {inputValue: Preprocessing module},
+      --output, {inputValue: Transformed data dir},
+    ]
+  fileOutputs:
+    Transformed data dir: /output.txt
diff --git a/components/kubeflow/dnntrainer/component.yaml b/components/kubeflow/dnntrainer/component.yaml
new file mode 100644
index 00000000000..ebe60e52b74
--- /dev/null
+++ b/components/kubeflow/dnntrainer/component.yaml
@@ -0,0 +1,35 @@
+name: Train FC DNN using TF
+description: |
+  Trains a fully-connected neural network using TensorFlow
+  Input and output data is in GCS
+inputs:
+  - {name: Transformed data dir, type: {GcsUri: Directory}, description: 'GCS path containing tf-transformed training and eval data.'}
+  - {name: Schema, type: {GcsUri: [text, json]}, description: 'GCS json schema file path.'}
+  - {name: Learning rate, type: Float, description: 'Learning rate for training.'} #default=0.1
+#  - {name: Optimizer, type: {Enum: [Adam, SGD, Adagrad]}, description: 'Optimizer for training. If not provided, tf.estimator default will be used.'} #default='Adagrad'
+  - {name: Hidden layer size, type: String, description: 'Comma-separated hidden layer sizes. For example "200,100,50".'} #default='100'
+  - {name: Steps, type: Integer, description: 'Maximum number of training steps to perform. If unspecified, will honor epochs.'}
+#  - {name: Epochs, type: Integer, description: 'Maximum number of training data epochs on which to train. If both "steps" and "epochs" are specified, the training job will run for "steps" or "epochs", whichever occurs first.'}
+  - {name: Target, type: String, description: 'Name of the column for prediction target.'}
+  - {name: Preprocessing module, type: {GcsUri: [text, python]}, description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.
Can be empty.'} + - {name: Training output dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} +outputs: + - {name: Training output dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} +implementation: + container: + image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:85c6413a2e13da4b8f198aeac1abc2f3a74fe789 + command: [python, -m, trainer.task] #python2.7 + args: [ + --transformed-data-dir, {inputValue: Transformed data dir}, + --schema, {inputValue: Schema}, + --learning-rate, {inputValue: Learning rate}, +# --optimizer, {inputValue: Optimizer}, + --hidden-layer-size, {inputValue: Hidden layer size}, + --steps, {inputValue: Steps}, +# --epochs, {inputValue: Epochs}, + --target, {inputValue: Target}, + --preprocessing-module, {inputValue: Preprocessing module}, + --job-dir, {inputValue: Training output dir}, + ] + fileOutputs: + Training output dir: /output.txt diff --git a/components/local/confusion_matrix/component.yaml b/components/local/confusion_matrix/component.yaml new file mode 100644 index 00000000000..d84e75dc6f5 --- /dev/null +++ b/components/local/confusion_matrix/component.yaml @@ -0,0 +1,20 @@ +name: Confusion matrix +description: Calculates confusion matrix +inputs: + - {name: Predictions, type: {GcsUri: [text, CSV]}, description: 'GCS path of prediction file pattern.'} + - {name: Output dir, type: {GcsUri: Directory}, description: 'GCS path of the output directory.'} +outputs: + - {name: UI metadata, type: UI metadata} + - {name: Metrics, type: Metrics} +implementation: + container: + image: gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:85c6413a2e13da4b8f198aeac1abc2f3a74fe789 + command: [python, /ml/confusion_matrix.py] #python2.7 + args: [ + --predictions, {inputValue: Predictions}, + --output, {inputValue: Output dir}, + ] +#Argo deletes the source files as soon as it uploads them to the artifact store. Trying to output the same files as parameter outputs fails since the source files are already deleted. +# fileOutputs: +# UI metadata: /mlpipeline-ui-metadata.json +# Metrics: /mlpipeline-metrics.json diff --git a/components/local/roc/component.yaml b/components/local/roc/component.yaml new file mode 100644 index 00000000000..a48df71be3f --- /dev/null +++ b/components/local/roc/component.yaml @@ -0,0 +1,21 @@ +name: ROC curve +description: Calculates Receiver Operating Characteristic curve. 
See https://en.wikipedia.org/wiki/Receiver_operating_characteristic
+inputs:
+  - {name: Predictions dir, type: {GcsUri: Directory}, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir data + schema files
+  - {name: True class, type: String, description: 'The name of the class as true value.'}
+  - {name: Output dir, type: {GcsUri: Directory}, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file
+outputs:
+  - {name: UI metadata, type: UI metadata}
+  - {name: Metrics, type: Metrics}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/ml-pipeline-local-roc:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
+    command: [python, /ml/roc.py] #python2.7
+    args: [
+      --predictions, {inputValue: Predictions},
+      --trueclass, {inputValue: True class},
+      --output, {inputValue: Output dir},
+    ]
+  fileOutputs:
+    UI metadata: /mlpipeline-ui-metadata.json
+    Metrics: /mlpipeline-metrics.json
diff --git a/samples/kubeflow-tf/kubeflow-training-classification.py b/samples/kubeflow-tf/kubeflow-training-classification.py
index eb40cb42cc6..79420c240fa 100755
--- a/samples/kubeflow-tf/kubeflow-training-classification.py
+++ b/samples/kubeflow-tf/kubeflow-training-classification.py
@@ -13,75 +13,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
+from pathlib import Path
 
 import kfp.dsl as dsl
 import kfp.gcp as gcp
-import datetime
 
-def dataflow_tf_transform_op(train_data: 'GcsUri', evaluation_data: 'GcsUri', schema: 'GcsUri[text/json]', project: 'GcpProject', preprocess_mode, preprocess_module: 'GcsUri[text/code/python]', transform_output: 'GcsUri[Directory]', step_name='preprocess'):
-    return dsl.ContainerOp(
-        name = step_name,
-        image = 'gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:2c2445df83fa879387a200747cc20f72a7ee9727',
-        arguments = [
-            '--train', train_data,
-            '--eval', evaluation_data,
-            '--schema', schema,
-            '--project', project,
-            '--mode', preprocess_mode,
-            '--preprocessing-module', preprocess_module,
-            '--output', transform_output,
-        ],
-        file_outputs = {'transformed': '/output.txt'}
-    )
+from kfp.components import ComponentStore
+
+cs = ComponentStore()
+cs.local_search_paths.append(str(Path(__file__).parent.joinpath('../../components/'))) #local repo checkout path
+cs.url_search_prefixes.append('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/')
+cs.url_search_prefixes.append('https://raw.githubusercontent.com/Ark-kun/pipelines/Added-component-definitions-to-our-components/components/')
 
-def kubeflow_tf_training_op(transformed_data_dir, schema: 'GcsUri[text/json]', learning_rate: float, hidden_layer_size: int, steps: int, target, preprocess_module: 'GcsUri[text/code/python]', training_output: 'GcsUri[Directory]', step_name='training', use_gpu=False):
-    kubeflow_tf_training_op = dsl.ContainerOp(
-        name = step_name,
-        image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:2c2445df83fa879387a200747cc20f72a7ee9727',
-        arguments = [
-            '--transformed-data-dir', transformed_data_dir,
-            '--schema', schema,
-            '--learning-rate', learning_rate,
-            '--hidden-layer-size', hidden_layer_size,
-            '--steps', steps,
-            '--target', target,
-            '--preprocessing-module', preprocess_module,
-            '--job-dir', training_output,
-        ],
-        file_outputs = {'train': '/output.txt'}
-    )
-    if use_gpu:
-        kubeflow_tf_training_op.image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer-gpu:2c2445df83fa879387a200747cc20f72a7ee9727'
-        kubeflow_tf_training_op.set_gpu_limit(1)
-
-    return kubeflow_tf_training_op
-
-def dataflow_tf_predict_op(evaluation_data: 'GcsUri', schema: 'GcsUri[text/json]', target: str, model: 'TensorFlow model', predict_mode, project: 'GcpProject', prediction_output: 'GcsUri', step_name='prediction'):
-    return dsl.ContainerOp(
-        name = step_name,
-        image = 'gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:2c2445df83fa879387a200747cc20f72a7ee9727',
-        arguments = [
-            '--data', evaluation_data,
-            '--schema', schema,
-            '--target', target,
-            '--model', model,
-            '--mode', predict_mode,
-            '--project', project,
-            '--output', prediction_output,
-        ],
-        file_outputs = {'prediction': '/output.txt'}
-    )
-
-def confusion_matrix_op(predictions, output, step_name='confusionmatrix'):
-    return dsl.ContainerOp(
-        name = step_name,
-        image = 'gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:2c2445df83fa879387a200747cc20f72a7ee9727',
-        arguments = [
-            '--predictions', predictions,
-            '--output', output,
-        ]
-    )
+dataflow_tf_transform_op = cs.load_component('dataflow/tft')
+kubeflow_tf_training_op = cs.load_component('kubeflow/dnntrainer')
+dataflow_tf_predict_op = cs.load_component('dataflow/predict')
+confusion_matrix_op = cs.load_component('local/confusion_matrix')
 
 @dsl.pipeline(
   name='Pipeline TFJob',
@@ -105,7 +53,11 @@ def kubeflow_training(output, project,
   use_gpu = False
 
   preprocess = dataflow_tf_transform_op(train, evaluation, schema, project, preprocess_mode, '', '%s/%s/transformed' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
-  training = kubeflow_tf_training_op(preprocess.output, schema, learning_rate, hidden_layer_size, steps, target, '', '%s/%s/train' % (output, workflow), use_gpu=use_gpu).apply(gcp.use_gcp_secret('user-gcp-sa'))
+  training = kubeflow_tf_training_op(preprocess.output, schema, learning_rate, hidden_layer_size, steps, target, '', '%s/%s/train' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
+  if use_gpu:
+    training.image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer-gpu:2c2445df83fa879387a200747cc20f72a7ee9727'
+    training.set_gpu_limit(1)
+
   prediction = dataflow_tf_predict_op(evaluation, schema, target, training.output, predict_mode, project, '%s/%s/predict' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
   confusion_matrix = confusion_matrix_op(prediction.output, '%s/%s/confusionmatrix' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
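The sample above resolves component names through a ComponentStore. The resolution is layered: each entry in local_search_paths is tried first, then each URL prefix, with "<name>/component.yaml" appended. A minimal hand-rolled equivalent for one component (a sketch only; it assumes the kfp SDK used by the sample and loads from the master branch URL listed above):

from kfp.components import load_component_from_url

# Same definition the store would resolve for cs.load_component('dataflow/tft')
dataflow_tf_transform_op = load_component_from_url(
    'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/'
    'dataflow/tft/component.yaml')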
From 42556f6480d9c567cb1f2877b41c8c6c56e1dd2c Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Mon, 25 Mar 2019 20:30:42 -0700
Subject: [PATCH 02/13] Added the definition for "TFX - Data Validation"

---
 components/dataflow/tfdv/component.yaml | 34 +++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 components/dataflow/tfdv/component.yaml

diff --git a/components/dataflow/tfdv/component.yaml b/components/dataflow/tfdv/component.yaml
new file mode 100644
index 00000000000..6e277089543
--- /dev/null
+++ b/components/dataflow/tfdv/component.yaml
@@ -0,0 +1,34 @@
+name: TFX - Data Validation
+description: |
+  Runs TensorFlow Data Validation. https://www.tensorflow.org/tfx/data_validation/get_started
+  TensorFlow Data Validation (TFDV) can analyze training and serving data to:
+  * compute descriptive statistics,
+  * infer a schema,
+  * detect data anomalies.
+inputs:
+- {name: Inference data, type: {GcsPath: {data_type: CSV}}, description: GCS path of the CSV file from which to infer the schema.}
+- {name: Validation data, type: {GcsPath: {data_type: CSV}}, description: GCS path of the CSV file whose contents should be validated.}
+- {name: Column names, type: {GcsPath: {data_type: JSON}}, description: GCS json file containing a list of column names.}
+- {name: Key columns, type: String, description: Comma separated list of columns to treat as keys.}
+- {name: GCP Project, type: GcpProject, default: '', description: The GCP project to run the dataflow job.}
+- {name: Mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud". }
+- {name: Validation output, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory.}
+outputs:
+- {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: GCS path of the inferred schema JSON.}
+- {name: Validation result, type: String, description: Indicates whether anomalies were detected or not.}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfdv:3b949b37aa2cefd3180398d59116f43ce965a2a6
+    command: [python2, /ml/validate.py]
+    args: [
+      --csv-data-for-inference, {inputValue: Inference data},
+      --csv-data-to-validate, {inputValue: Validation data},
+      --column-names, {inputValue: Column names},
+      --key-columns, {inputValue: Key columns},
+      --project, {inputValue: Project},
+      --mode, {inputValue: Mode},
+      --output, {inputValue: Validation output},
+    ]
+  fileOutputs:
+    Schema: /schema.txt
+    Validation result: /output_validation_result.txt
\ No newline at end of file
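The Data Validation component is the first in this series with two declared outputs. A sketch of consuming it with the ComponentStore pattern from the earlier sample (bucket paths are illustrative; the exact keys under task.outputs are an assumption here, derived from the output names "Schema" and "Validation result"):

tfdv_op = cs.load_component('dataflow/tfdv')

# Inside a @dsl.pipeline function:
validation = tfdv_op(
    inference_data='gs://my-bucket/train.csv',
    validation_data='gs://my-bucket/eval.csv',
    column_names='gs://my-bucket/column-names.json',
    key_columns='key',
    gcp_project=project,
    mode='local',
    validation_output='gs://my-bucket/validation',
).apply(gcp.use_gcp_secret('user-gcp-sa'))

# The inferred schema (written to /schema.txt in the container) can then
# feed downstream steps, e.g. via validation.outputs['schema'].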
From 0cd0c05e952f1c675cd3f719ebc9a3fa6fdeb06e Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Mon, 25 Mar 2019 21:31:42 -0700
Subject: [PATCH 03/13] Added the definition for the "TFX - Analyze model"
 component

---
 components/dataflow/tfma/component.yaml | 32 +++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 components/dataflow/tfma/component.yaml

diff --git a/components/dataflow/tfma/component.yaml b/components/dataflow/tfma/component.yaml
new file mode 100644
index 00000000000..ae322a4ae40
--- /dev/null
+++ b/components/dataflow/tfma/component.yaml
@@ -0,0 +1,32 @@
+name: TFX - Analyze model
+description: |
+  Runs TensorFlow Model Analysis. https://www.tensorflow.org/tfx/model_analysis/get_started
+  TensorFlow Model Analysis allows you to perform model evaluations in the TFX pipeline, and view resultant metrics and plots in a Jupyter notebook.
+  Specifically, it can provide:
+  * metrics computed on entire training and holdout dataset, as well as next-day evaluations
+  * tracking metrics over time
+  * model quality performance on different feature slices
+inputs:
+- {name: Model, type: {GcsPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: GCS path to the model which will be evaluated.}
+- {name: Evaluation data, type: {GcsPath: {data_type: CSV}}, description: GCS path of eval files.}
+- {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: GCS json schema file path.}
+- {name: Run mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow.}
+- {name: GCP project, type: GcpProject, default: '', description: 'The GCP project to run the dataflow job, if running in the `cloud` mode.'}
+- {name: Slice columns, type: String, description: Comma-separated list of columns on which to slice for analysis.}
+- {name: Analysis results dir, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory where the analysis results should be written.}
+outputs:
+- {name: Analysis results dir, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory where the analysis results were written.}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfma:3b949b37aa2cefd3180398d59116f43ce965a2a6
+    command: [python2, /ml/model_analysis.py]
+    args: [
+      --model, {inputValue: Model},
+      --eval, {inputValue: Evaluation data},
+      --schema, {inputValue: Schema},
+      --mode, {inputValue: Run mode},
+      --project, {inputValue: GCP project},
+      --slice-columns, {inputValue: Slice columns},
+      --output, {inputValue: Analysis results dir},
+    ]
+  fileOutputs:
+    Analysis results dir: /output.txt

From a13fdc1e26b5824ea64e594c3f5bf936a1637c3b Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Mon, 25 Mar 2019 21:31:59 -0700
Subject: [PATCH 04/13] Fixed bug in "ROC curve"

---
 components/local/roc/component.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/local/roc/component.yaml b/components/local/roc/component.yaml
index a48df71be3f..b10187f83c8 100644
--- a/components/local/roc/component.yaml
+++ b/components/local/roc/component.yaml
@@ -12,7 +12,7 @@ implementation:
     image: gcr.io/ml-pipeline/ml-pipeline-local-roc:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
     command: [python, /ml/roc.py] #python2.7
     args: [
-      --predictions, {inputValue: Predictions},
+      --predictions, {inputValue: Predictions dir},
       --trueclass, {inputValue: True class},
       --output, {inputValue: Output dir},
     ]

From 88273dd4b4d45d66f47f0b13beb7e4683455084a Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Mon, 25 Mar 2019 21:32:19 -0700
Subject: [PATCH 05/13] Updated "Predict using TF on Dataflow"

---
 components/dataflow/predict/component.yaml | 32 +++++++++++-----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/components/dataflow/predict/component.yaml b/components/dataflow/predict/component.yaml
index 4ea14ecced6..6b05e2c340c 100644
--- a/components/dataflow/predict/component.yaml
+++ b/components/dataflow/predict/component.yaml
@@ -3,28 +3,28 @@ description: |
   Runs TensorFlow prediction on Google Cloud Dataflow
   Input and output data is in GCS
 inputs:
-  - {name: Data file pattern, type: {GcsUriPattern: [text, CSV]}, description: 'GCS or local path of test file patterns.'}
-  - {name: Schema, type: {GcsUri: [text, json]}, description: 'GCS json schema file path.'}
-  - {name: Target, type: String, description: 'Name of the column for prediction target.'}
-  - {name: Model, type: {GcsUri: [Directory, Exported TensorFlow models]}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model. #TODO: Output single model from trainer
+  - {name: Data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS or local path of test file patterns.'}
+  - {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: 'GCS json schema file path.'}
+  - {name: Target column, type: String, description: 'Name of the column for prediction target.'}
+  - {name: Model, type: {GcsPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model. #TODO: Output single model from trainer
 # - {name: Batch size, type: Integer, default: 32, description: 'Batch size used in prediction.'}
-  - {name: Run mode, type: {Enum: [local, cloud]}, description: 'Whether to run the job locally or in Cloud Dataflow.'}
-  - {name: GCP project, type: GCP project, description: 'The GCP project to run the dataflow job.'}
-  - {name: Predictions dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
+  - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".'}
+  - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'}
+  - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
 outputs:
-  - {name: Predictions dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
+  - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
 implementation:
   container:
     image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
-    command: [python, /ml/predict.py] #python2.7
+    command: [python2, /ml/predict.py]
     args: [
-      --data, {inputValue: Data file pattern},
-      --schema, {inputValue: Schema},
-      --target, {inputValue: Target},
-      --model, {inputValue: Model},
-      --mode, {inputValue: Run mode},
-      --project, {inputValue: GCP project},
-      --output, {inputValue: Predictions dir},
+      --data, {inputValue: Data file pattern},
+      --schema, {inputValue: Schema},
+      --target, {inputValue: Target column},
+      --model, {inputValue: Model},
+      --mode, {inputValue: Run mode},
+      --project, {inputValue: GCP project},
+      --output, {inputValue: Predictions dir},
     ]
   fileOutputs:
     Predictions dir: /output.txt

From acc4385b2ddeb3d75c3c3230a6f9a8d4c21a7f24 Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Mon, 25 Mar 2019 21:32:39 -0700
Subject: [PATCH 06/13] Updated "TFX - Data Validation"

---
 components/dataflow/tfdv/component.yaml | 8 ++++----
 1 file
changed, 4 insertions(+), 4 deletions(-) diff --git a/components/dataflow/tfdv/component.yaml b/components/dataflow/tfdv/component.yaml index 6e277089543..aa604bbefab 100644 --- a/components/dataflow/tfdv/component.yaml +++ b/components/dataflow/tfdv/component.yaml @@ -10,8 +10,8 @@ inputs: - {name: Validation data, type: {GcsPath: {data_type: CSV}}, description: GCS path of the CSV file whose contents should be validated.} - {name: Column names, type: {GcsPath: {data_type: JSON}}, description: GCS json file containing a list of column names.} - {name: Key columns, type: String, description: Comma separated list of columns to treat as keys.} -- {name: GCP Project, type: GcpProject, default: '', description: The GCP project to run the dataflow job.} -- {name: Mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud". } +- {name: GCP project, type: GcpProject, default: '', description: The GCP project to run the dataflow job.} +- {name: Run mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud". } - {name: Validation output, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory.} outputs: - {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: GCS path of the inferred schema JSON.} @@ -25,8 +25,8 @@ implementation: --csv-data-to-validate, {inputValue: Validation data}, --column-names, {inputValue: Column names}, --key-columns, {inputValue: Key columns}, - --project, {inputValue: Project}, - --mode, {inputValue: Mode}, + --project, {inputValue: GCP project}, + --mode, {inputValue: Run mode}, --output, {inputValue: Validation output}, ] fileOutputs: From 6af7f622155880a4046c50179b93f9624252b9be Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Tue, 26 Mar 2019 16:06:10 -0700 Subject: [PATCH 07/13] Updated the component definitions. 
---
 components/dataflow/predict/component.yaml    | 33 ++++++++--------
 components/dataflow/tfdv/component.yaml       |  2 +-
 components/dataflow/tfma/component.yaml       |  2 +-
 components/dataflow/tft/component.yaml        | 24 ++++++------
 components/kubeflow/dnntrainer/component.yaml | 32 +++++++--------
 .../local/confusion_matrix/component.yaml     | 14 +++----
 components/local/roc/component.yaml           | 32 ++++++++-------
 .../kubeflow-training-classification.py       |  4 --
 samples/xgboost-spark/xgboost-training-cm.py  | 39 ++++++-------------
 9 files changed, 81 insertions(+), 101 deletions(-)

diff --git a/components/dataflow/predict/component.yaml b/components/dataflow/predict/component.yaml
index 6b05e2c340c..2e68d6a97d2 100644
--- a/components/dataflow/predict/component.yaml
+++ b/components/dataflow/predict/component.yaml
@@ -3,28 +3,29 @@ description: |
   Runs TensorFlow prediction on Google Cloud Dataflow
   Input and output data is in GCS
 inputs:
-  - {name: Data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS or local path of test file patterns.'}
-  - {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: 'GCS json schema file path.'}
-  - {name: Target column, type: String, description: 'Name of the column for prediction target.'}
+  - {name: Data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS or local path of test file patterns.'}
+  - {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: 'GCS json schema file path.'}
+  - {name: Target column, type: String, description: 'Name of the column for prediction target.'}
   - {name: Model, type: {GcsPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model. #TODO: Output single model from trainer
-# - {name: Batch size, type: Integer, default: 32, description: 'Batch size used in prediction.'}
+  - {name: Batch size, type: Integer, default: '32', description: 'Batch size used in prediction.'}
   - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow.
Valid values are "local" and "cloud".'} + - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'} + - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file outputs: - - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file + - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file implementation: container: - image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:85c6413a2e13da4b8f198aeac1abc2f3a74fe789 + image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:2c2445df83fa879387a200747cc20f72a7ee9727 command: [python2, /ml/predict.py] args: [ - --data, {inputValue: Data file pattern}, - --schema, {inputValue: Schema}, - --target, {inputValue: Target column}, - --model, {inputValue: Model}, - --mode, {inputValue: Run mode}, - --project, {inputValue: GCP project}, - --output, {inputValue: Predictions dir}, + --data, {inputValue: Data file pattern}, + --schema, {inputValue: Schema}, + --target, {inputValue: Target column}, + --model, {inputValue: Model}, + --mode, {inputValue: Run mode}, + --project, {inputValue: GCP project}, + --batchsize, {inputValue: Batch size}, + --output, {inputValue: Predictions dir}, ] fileOutputs: Predictions dir: /output.txt diff --git a/components/dataflow/tfdv/component.yaml b/components/dataflow/tfdv/component.yaml index aa604bbefab..f51c3cdfa46 100644 --- a/components/dataflow/tfdv/component.yaml +++ b/components/dataflow/tfdv/component.yaml @@ -18,7 +18,7 @@ outputs: - {name: Validation result, type: String, description: Indicates whether anomalies were detected or not.} implementation: container: - image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfdv:3b949b37aa2cefd3180398d59116f43ce965a2a6 + image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfdv:2c2445df83fa879387a200747cc20f72a7ee9727 command: [python2, /ml/validate.py] args: [ --csv-data-for-inference, {inputValue: Inference data}, diff --git a/components/dataflow/tfma/component.yaml b/components/dataflow/tfma/component.yaml index ae322a4ae40..ca9d9d0e9c1 100644 --- a/components/dataflow/tfma/component.yaml +++ b/components/dataflow/tfma/component.yaml @@ -17,7 +17,7 @@ outputs: - {name: Analysis results dir, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory where the analysis results should were written.} implementation: container: - image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfma:3b949b37aa2cefd3180398d59116f43ce965a2a6 + image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfma:2c2445df83fa879387a200747cc20f72a7ee9727 command: [python2, /ml/model_analysis.py] args: [ --model, {inputValue: Model}, diff --git a/components/dataflow/tft/component.yaml b/components/dataflow/tft/component.yaml index 0e41a967d9a..a4bedbbc977 100644 --- a/components/dataflow/tft/component.yaml +++ b/components/dataflow/tft/component.yaml @@ -1,21 +1,19 @@ name: Transform using TF on Dataflow -description: | - Runs TensorFlow Transform on Google Cloud Dataflow - Input and output data is in GCS +description: Runs TensorFlow Transform on Google Cloud Dataflow inputs: - - {name: Training 
data file pattern, type: {GcsUriPattern: [text, CSV]}, description: 'GCS path of train file patterns.'} #Also supports local CSV
-  - {name: Evaluation data file pattern, type: {GcsUriPattern: [text, CSV]}, description: 'GCS path of eval file patterns.'} #Also supports local CSV
-  - {name: Schema, type: {GcsUri: [text, json]}, description: 'GCS json schema file path.'}
-  - {name: GCP project, type: GCP project, description: 'The GCP project to run the dataflow job.'}
-  - {name: Run mode, type: {Enum: [local, cloud]}, description: 'Whether to run the job locally or in Cloud Dataflow.'}
-  - {name: Preprocessing module, type: {GcsUri: [text, python]}, description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions. Can be empty.'}
-  - {name: Transformed data dir, type: {GcsUri: Directory}, description: 'GCS or local directory'} #Also supports local paths
+  - {name: Training data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS path of train file patterns.'} #Also supports local CSV
+  - {name: Evaluation data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS path of eval file patterns.'} #Also supports local CSV
+  - {name: Schema, type: {GcsPath: {data_type: JSON}}, description: 'GCS json schema file path.'}
+  - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'}
+  - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".' }
+  - {name: Preprocessing module, type: {GcsPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'}
+  - {name: Transformed data dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory'} #Also supports local paths
 outputs:
-  - {name: Transformed data dir, type: {GcsUri: Directory}}
+  - {name: Transformed data dir, type: {GcsPath: {path_type: Directory}}}
 implementation:
   container:
-    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
-    command: [python, /ml/transform.py] #python2.7
+    image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:2c2445df83fa879387a200747cc20f72a7ee9727
+    command: [python2, /ml/transform.py]
     args: [
       --train, {inputValue: Training data file pattern},
       --eval, {inputValue: Evaluation data file pattern},
       --schema, {inputValue: Schema},
       --project, {inputValue: GCP project},
       --mode, {inputValue: Run mode},
       --preprocessing-module, {inputValue: Preprocessing module},
       --output, {inputValue: Transformed data dir},
     ]
   fileOutputs:
     Transformed data dir: /output.txt
diff --git a/components/kubeflow/dnntrainer/component.yaml b/components/kubeflow/dnntrainer/component.yaml
index ebe60e52b74..1a63ea022bb 100644
--- a/components/kubeflow/dnntrainer/component.yaml
+++ b/components/kubeflow/dnntrainer/component.yaml
@@ -1,29 +1,27 @@
 name: Train FC DNN using TF
-description: |
-  Trains a fully-connected neural network using TensorFlow
-  Input and output data is in GCS
+description: Trains a fully-connected neural network using TensorFlow
 inputs:
-  - {name: Transformed data dir, type: {GcsUri: Directory}, description: 'GCS path containing tf-transformed training and eval data.'}
-  - {name: Schema, type: {GcsUri: [text, json]}, description: 'GCS json schema file path.'}
-  - {name: Learning rate, type: Float, description: 'Learning rate for training.'} #default=0.1
-#  - {name: Optimizer, type: {Enum: [Adam, SGD, Adagrad]}, description: 'Optimizer for training. If not provided, tf.estimator default will be used.'} #default='Adagrad'
-  - {name: Hidden layer size, type: String, description: 'Comma-separated hidden layer sizes.
For example "200,100,50".'} #default='100' - - {name: Steps, type: Integer, description: 'Maximum number of training steps to perform. If unspecified, will honor epochs.'} -# - {name: Epochs, type: Integer, description: 'Maximum number of training data epochs on which to train. If both "steps" and "epochs" are specified, the training job will run for "steps" or "epochs", whichever occurs first.'} - - {name: Target, type: String, description: 'Name of the column for prediction target.'} - - {name: Preprocessing module, type: {GcsUri: [text, python]}, description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions. Can be empty.'} - - {name: Training output dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} + - {name: Transformed data dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path containing tf-transformed training and eval data.'} + - {name: Schema, type: {GcsPath: {data_type: JSON}}, description: 'GCS json schema file path.'} + - {name: Learning rate, type: Float, default: '0.1', description: 'Learning rate for training.'} + - {name: Optimizer, type: String, default: 'Adagrad', description: 'Optimizer for training. Valid values are: Adam, SGD, Adagrad. If not provided, tf.estimator default will be used.'} + - {name: Hidden layer size, type: String, default: '100', description: 'Comma-separated hidden layer sizes. For example "200,100,50".'} + - {name: Steps, type: Integer, description: 'Maximum number of training steps to perform. If unspecified, will honor epochs.'} + #- {name: Epochs, type: Integer, default: '', description: 'Maximum number of training data epochs on which to train. If both "steps" and "epochs" are specified, the training job will run for "steps" or "epochs", whichever occurs first.'} + - {name: Target, type: String, description: 'Name of the column for prediction target.'} + - {name: Preprocessing module, type: {GcsPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} + - {name: Training output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} outputs: - - {name: Training output dir, type: {GcsUri: Directory}, description: 'GCS or local directory.'} + - {name: Training output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} implementation: container: - image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:85c6413a2e13da4b8f198aeac1abc2f3a74fe789 - command: [python, -m, trainer.task] #python2.7 + image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:2c2445df83fa879387a200747cc20f72a7ee9727 + command: [python2, -m, trainer.task] args: [ --transformed-data-dir, {inputValue: Transformed data dir}, --schema, {inputValue: Schema}, --learning-rate, {inputValue: Learning rate}, -# --optimizer, {inputValue: Optimizer}, + --optimizer, {inputValue: Optimizer}, --hidden-layer-size, {inputValue: Hidden layer size}, --steps, {inputValue: Steps}, # --epochs, {inputValue: Epochs}, diff --git a/components/local/confusion_matrix/component.yaml b/components/local/confusion_matrix/component.yaml index d84e75dc6f5..b7925d6ce5c 100644 --- a/components/local/confusion_matrix/component.yaml +++ b/components/local/confusion_matrix/component.yaml @@ -1,15 +1,15 @@ name: Confusion matrix description: Calculates confusion matrix inputs: - - {name: Predictions, type: {GcsUri: [text, CSV]}, description: 'GCS path of prediction file pattern.'} - - {name: 
Output dir, type: {GcsUri: Directory}, description: 'GCS path of the output directory.'} -outputs: - - {name: UI metadata, type: UI metadata} - - {name: Metrics, type: Metrics} + - {name: Predictions, type: {GcsPath: {data_type: CSV}}, description: 'GCS path of prediction file pattern.'} + - {name: Output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} +#outputs: +# - {name: UI metadata, type: UI metadata} +# - {name: Metrics, type: Metrics} implementation: container: - image: gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:85c6413a2e13da4b8f198aeac1abc2f3a74fe789 - command: [python, /ml/confusion_matrix.py] #python2.7 + image: gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:2c2445df83fa879387a200747cc20f72a7ee9727 + command: [python2, /ml/confusion_matrix.py] args: [ --predictions, {inputValue: Predictions}, --output, {inputValue: Output dir}, diff --git a/components/local/roc/component.yaml b/components/local/roc/component.yaml index b10187f83c8..02b8a180ee0 100644 --- a/components/local/roc/component.yaml +++ b/components/local/roc/component.yaml @@ -1,21 +1,25 @@ name: ROC curve description: Calculates Receiver Operating Characteristic curve. See https://en.wikipedia.org/wiki/Receiver_operating_characteristic inputs: - - {name: Predictions dir, type: {GcsUri: Directory}, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir data + schema files - - {name: True class, type: String, description: 'The name of the class as true value.'} - - {name: Output dir, type: {GcsUri: Directory}, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file -outputs: - - {name: UI metadata, type: UI metadata} - - {name: Metrics, type: Metrics} + - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir data + schema files + - {name: True class, type: String, default: 'true', description: 'The true class label for the sample. Default is "true".'} + - {name: True score column, type: String, default: 'true', description: 'The name of the column for positive probability.'} + - {name: Target lambda, type: String, default: '', description: 'Text of Python lambda function which returns boolean value indicating whether the classification result is correct.\nFor example, "lambda x: x[''a''] and x[''b'']". 
If missing, input must have a "target" column.'}
+  - {name: Output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file
+#outputs:
+#  - {name: UI metadata, type: UI metadata}
+#  - {name: Metrics, type: Metrics}
 implementation:
   container:
-    image: gcr.io/ml-pipeline/ml-pipeline-local-roc:85c6413a2e13da4b8f198aeac1abc2f3a74fe789
-    command: [python, /ml/roc.py] #python2.7
+    image: gcr.io/ml-pipeline/ml-pipeline-local-roc:2c2445df83fa879387a200747cc20f72a7ee9727
+    command: [python2, /ml/roc.py]
     args: [
-      --predictions, {inputValue: Predictions dir},
-      --trueclass, {inputValue: True class},
-      --output, {inputValue: Output dir},
+      --predictions, {inputValue: Predictions dir},
+      --trueclass, {inputValue: True class},
+      --true_score_column, {inputValue: True score column},
+      --target_lambda, {inputValue: Target lambda},
+      --output, {inputValue: Output dir},
     ]
-  fileOutputs:
-    UI metadata: /mlpipeline-ui-metadata.json
-    Metrics: /mlpipeline-metrics.json
+#  fileOutputs:
+#    UI metadata: /mlpipeline-ui-metadata.json
+#    Metrics: /mlpipeline-metrics.json
diff --git a/samples/kubeflow-tf/kubeflow-training-classification.py b/samples/kubeflow-tf/kubeflow-training-classification.py
index 79420c240fa..69bb0cee79f 100755
--- a/samples/kubeflow-tf/kubeflow-training-classification.py
+++ b/samples/kubeflow-tf/kubeflow-training-classification.py
@@ -13,16 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
-from pathlib import Path
-
 import kfp.dsl as dsl
 import kfp.gcp as gcp
 
 from kfp.components import ComponentStore
 
 cs = ComponentStore()
-cs.local_search_paths.append(str(Path(__file__).parent.joinpath('../../components/'))) #local repo checkout path
 cs.url_search_prefixes.append('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/')
 cs.url_search_prefixes.append('https://raw.githubusercontent.com/Ark-kun/pipelines/Added-component-definitions-to-our-components/components/')
 
diff --git a/samples/xgboost-spark/xgboost-training-cm.py b/samples/xgboost-spark/xgboost-training-cm.py
index b828153389d..69021c38dbe 100755
--- a/samples/xgboost-spark/xgboost-training-cm.py
+++ b/samples/xgboost-spark/xgboost-training-cm.py
@@ -17,6 +17,15 @@
 import kfp.dsl as dsl
 import kfp.gcp as gcp
 
+from kfp.components import ComponentStore
+
+cs = ComponentStore()
+cs.url_search_prefixes.append('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/')
+cs.url_search_prefixes.append('https://raw.githubusercontent.com/Ark-kun/pipelines/Added-component-definitions-to-our-components/components/')
+
+confusion_matrix_op = cs.load_component('local/confusion_matrix')
+roc_op = cs.load_component('local/roc')
+
 # ================================================================
 # The following classes should be provided by components provider.
@@ -135,32 +144,6 @@ def __init__(self, name, project, region, cluster_name, data, model, target, ana
         ],
         file_outputs={'output': '/output.txt'})
 
-
-class ConfusionMatrixOp(dsl.ContainerOp):
-
-  def __init__(self, name, predictions, output):
-    super(ConfusionMatrixOp, self).__init__(
-      name=name,
-      image='gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:2c2445df83fa879387a200747cc20f72a7ee9727',
-      arguments=[
-        '--output', output,
-        '--predictions', predictions
-      ])
-
-
-class RocOp(dsl.ContainerOp):
-
-  def __init__(self, name, predictions, trueclass, output):
-    super(RocOp, self).__init__(
-      name=name,
-      image='gcr.io/ml-pipeline/ml-pipeline-local-roc:2c2445df83fa879387a200747cc20f72a7ee9727',
-      arguments=[
-        '--output', output,
-        '--predictions', predictions,
-        '--trueclass', trueclass,
-        '--true_score_column', trueclass,
-      ])
-
 # =======================================================================
 
 @dsl.pipeline(
@@ -197,10 +180,10 @@ def xgb_train_pipeline(
     predict_op = PredictOp('predict', project, region, create_cluster_op.output, transform_op.outputs['eval'], train_op.output, target, analyze_op.output, '%s/{{workflow.name}}/predict' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))
 
-    confusion_matrix_op = ConfusionMatrixOp('confusion-matrix', predict_op.output,
+    confusion_matrix_task = confusion_matrix_op(predict_op.output,
         '%s/{{workflow.name}}/confusionmatrix' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))
 
-    roc_op = RocOp('roc', predict_op.output, true_label, '%s/{{workflow.name}}/roc' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))
+    roc_task = roc_op(predict_op.output, true_label, '%s/{{workflow.name}}/roc' % output).apply(gcp.use_gcp_secret('user-gcp-sa'))
 
 if __name__ == '__main__':
     import kfp.compiler as compiler
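The xgboost sample above now drops its hand-written ContainerOp subclasses in favor of factories loaded from component definitions; behavior at the call sites stays the same. A small sketch of the equivalence (bucket paths illustrative):

# Before: a dedicated wrapper class
#   ConfusionMatrixOp('confusion-matrix', predictions, output)
# After: a factory generated from local/confusion_matrix/component.yaml
confusion_matrix_op = cs.load_component('local/confusion_matrix')
confusion_matrix_task = confusion_matrix_op(
    predictions='gs://my-bucket/run1/predictions',
    output_dir='gs://my-bucket/run1/confusionmatrix')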
From 56616aeb19e82ddcb7713ce64afcace3cf1b4477 Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Tue, 26 Mar 2019 17:49:51 -0700
Subject: [PATCH 08/13] Updated the pipeline to make the lines shorter and
 explicitly name the function parameters

---
 .../kubeflow-training-classification.py       | 39 +++++++++++++++++--
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/samples/kubeflow-tf/kubeflow-training-classification.py b/samples/kubeflow-tf/kubeflow-training-classification.py
index 69bb0cee79f..36e2a67be5b 100755
--- a/samples/kubeflow-tf/kubeflow-training-classification.py
+++ b/samples/kubeflow-tf/kubeflow-training-classification.py
@@ -48,14 +48,45 @@ def kubeflow_training(output, project,
     # set the flag to use GPU trainer
     use_gpu = False
 
-    preprocess = dataflow_tf_transform_op(train, evaluation, schema, project, preprocess_mode, '', '%s/%s/transformed' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
-    training = kubeflow_tf_training_op(preprocess.output, schema, learning_rate, hidden_layer_size, steps, target, '', '%s/%s/train' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
+    preprocess = dataflow_tf_transform_op(
+        training_data_file_pattern=train,
+        evaluation_data_file_pattern=evaluation,
+        schema=schema,
+        gcp_project=project,
+        run_mode=preprocess_mode,
+        preprocessing_module='',
+        transformed_data_dir='%s/%s/transformed' % (output, workflow)
+    ).apply(gcp.use_gcp_secret('user-gcp-sa'))
+
+    training = kubeflow_tf_training_op(
+        transformed_data_dir=preprocess.output,
+        schema=schema,
+        learning_rate=learning_rate,
+        hidden_layer_size=hidden_layer_size,
+        steps=steps,
+        target=target,
+        preprocessing_module='',
+        training_output_dir='%s/%s/train' % (output, workflow)
+    ).apply(gcp.use_gcp_secret('user-gcp-sa'))
+
     if use_gpu:
         training.image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer-gpu:2c2445df83fa879387a200747cc20f72a7ee9727'
         training.set_gpu_limit(1)
 
-    prediction = dataflow_tf_predict_op(evaluation, schema, target, training.output, predict_mode, project, '%s/%s/predict' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
-    confusion_matrix = confusion_matrix_op(prediction.output, '%s/%s/confusionmatrix' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
+    prediction = dataflow_tf_predict_op(
+        data_file_pattern=evaluation,
+        schema=schema,
+        target_column=target,
+        model=training.output,
+        run_mode=predict_mode,
+        gcp_project=project,
+        predictions_dir='%s/%s/predict' % (output, workflow)
+    ).apply(gcp.use_gcp_secret('user-gcp-sa'))
+
+    confusion_matrix = confusion_matrix_op(
+        predictions=prediction.output,
+        output_dir='%s/%s/confusionmatrix' % (output, workflow)
+    ).apply(gcp.use_gcp_secret('user-gcp-sa'))
 
 if __name__ == '__main__':

From c3009e22702bf8177f6bc40fb0ef565c623f6577 Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Tue, 26 Mar 2019 22:29:07 -0700
Subject: [PATCH 09/13] Changed the GCSPath type casing

---
 components/dataflow/predict/component.yaml       | 10 +++++-----
 components/dataflow/tfdv/component.yaml          | 10 +++++-----
 components/dataflow/tfma/component.yaml          | 10 +++++-----
 components/dataflow/tft/component.yaml           | 12 ++++++------
 components/kubeflow/dnntrainer/component.yaml    | 10 +++++-----
 components/local/confusion_matrix/component.yaml |  4 ++--
 components/local/roc/component.yaml              |  4 ++--
 7 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/components/dataflow/predict/component.yaml b/components/dataflow/predict/component.yaml
index 2e68d6a97d2..66a0f367c36 100644
--- a/components/dataflow/predict/component.yaml
+++ b/components/dataflow/predict/component.yaml
@@ -3,16 +3,16 @@ description: |
   Runs TensorFlow prediction on Google Cloud Dataflow
   Input and output data is in GCS
 inputs:
-  - {name: Data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS or local path of test file patterns.'}
-  - {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: 'GCS json schema file path.'}
+  - {name: Data file pattern, type: {GCSPath: {data_type: CSV}}, description: 'GCS or local path of test file patterns.'}
+  - {name: Schema, type: {GCSPath: {data_type: TFDV schema JSON}}, description: 'GCS json schema file path.'}
   - {name: Target column, type: String, description: 'Name of the column for prediction target.'}
-  - {name: Model, type: {GcsPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model. #TODO: Output single model from trainer
+  - {name: Model, type: {GCSPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model. #TODO: Output single model from trainer
   - {name: Batch size, type: Integer, default: '32', description: 'Batch size used in prediction.'}
   - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow.
Valid values are "local" and "cloud".'} - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'} - - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file + - {name: Predictions dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file outputs: - - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file + - {name: Predictions dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file implementation: container: image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:2c2445df83fa879387a200747cc20f72a7ee9727 diff --git a/components/dataflow/tfdv/component.yaml b/components/dataflow/tfdv/component.yaml index f51c3cdfa46..4e9bd8eae80 100644 --- a/components/dataflow/tfdv/component.yaml +++ b/components/dataflow/tfdv/component.yaml @@ -6,15 +6,15 @@ description: | * infer a schema, * detect data anomalies. inputs: -- {name: Inference data, type: {GcsPath: {data_type: CSV}}, description: GCS path of the CSV file from which to infer the schema.} -- {name: Validation data, type: {GcsPath: {data_type: CSV}}, description: GCS path of the CSV file whose contents should be validated.} -- {name: Column names, type: {GcsPath: {data_type: JSON}}, description: GCS json file containing a list of column names.} +- {name: Inference data, type: {GCSPath: {data_type: CSV}}, description: GCS path of the CSV file from which to infer the schema.} +- {name: Validation data, type: {GCSPath: {data_type: CSV}}, description: GCS path of the CSV file whose contents should be validated.} +- {name: Column names, type: {GCSPath: {data_type: JSON}}, description: GCS json file containing a list of column names.} - {name: Key columns, type: String, description: Comma separated list of columns to treat as keys.} - {name: GCP project, type: GcpProject, default: '', description: The GCP project to run the dataflow job.} - {name: Run mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud". 
}
-- {name: Validation output, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory.}
+- {name: Validation output, type: {GCSPath: {path_type: Directory}}, description: GCS or local directory.}
 outputs:
-- {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: GCS path of the inferred schema JSON.}
+- {name: Schema, type: {GCSPath: {data_type: TFDV schema JSON}}, description: GCS path of the inferred schema JSON.}
 - {name: Validation result, type: String, description: Indicates whether anomalies were detected or not.}
 implementation:
   container:
diff --git a/components/dataflow/tfma/component.yaml b/components/dataflow/tfma/component.yaml
index ca9d9d0e9c1..cbd0a7be19e 100644
--- a/components/dataflow/tfma/component.yaml
+++ b/components/dataflow/tfma/component.yaml
@@ -6,15 +6,15 @@ description: |
   * tracking metrics over time
   * model quality performance on different feature slices
 inputs:
-- {name: Model, type: {GcsPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: GCS path to the model which will be evaluated.}
-- {name: Evaluation data, type: {GcsPath: {data_type: CSV}}, description: GCS path of eval files.}
-- {name: Schema, type: {GcsPath: {data_type: TFDV schema JSON}}, description: GCS json schema file path.}
+- {name: Model, type: {GCSPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: GCS path to the model which will be evaluated.}
+- {name: Evaluation data, type: {GCSPath: {data_type: CSV}}, description: GCS path of eval files.}
+- {name: Schema, type: {GCSPath: {data_type: TFDV schema JSON}}, description: GCS json schema file path.}
 - {name: Run mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow.}
 - {name: GCP project, type: GcpProject, default: '', description: 'The GCP project to run the dataflow job, if running in the `cloud` mode.'}
 - {name: Slice columns, type: String, description: Comma-separated list of columns on which to slice for analysis.}
-- {name: Analysis results dir, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory where the analysis results should be written.}
+- {name: Analysis results dir, type: {GCSPath: {path_type: Directory}}, description: GCS or local directory where the analysis results should be written.}
 outputs:
-- {name: Analysis results dir, type: {GcsPath: {path_type: Directory}}, description: GCS or local directory where the analysis results were written.}
+- {name: Analysis results dir, type: {GCSPath: {path_type: Directory}}, description: GCS or local directory where the analysis results were written.}
 implementation:
   container:
diff --git a/components/dataflow/tft/component.yaml b/components/dataflow/tft/component.yaml
index a4bedbbc977..0a7909e8f64 100644
--- a/components/dataflow/tft/component.yaml
+++ b/components/dataflow/tft/component.yaml
@@ -1,15 +1,15 @@
 name: Transform using TF on Dataflow
 description: Runs TensorFlow Transform on Google Cloud Dataflow
 inputs:
-  - {name: Training data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS path of train file patterns.'} #Also supports local CSV
-  - {name: Evaluation data file pattern, type: {GcsPath: {data_type: CSV}}, description: 'GCS path of eval file patterns.'} #Also supports local CSV
-  - {name: Schema, type: {GcsPath: {data_type: JSON}}, description: 'GCS json schema file
path.'} + - {name: Training data file pattern, type: {GCSPath: {data_type: CSV}}, description: 'GCS path of train file patterns.'} #Also supports local CSV + - {name: Evaluation data file pattern, type: {GCSPath: {data_type: CSV}}, description: 'GCS path of eval file patterns.'} #Also supports local CSV + - {name: Schema, type: {GCSPath: {data_type: JSON}}, description: 'GCS json schema file path.'} - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'} - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".' } - - {name: Preprocessing module, type: {GcsPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} - - {name: Transformed data dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory'} #Also supports local paths + - {name: Preprocessing module, type: {GCSPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} + - {name: Transformed data dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory'} #Also supports local paths outputs: - - {name: Transformed data dir, type: {GcsPath: {path_type: Directory}}} + - {name: Transformed data dir, type: {GCSPath: {path_type: Directory}}} implementation: container: image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:2c2445df83fa879387a200747cc20f72a7ee9727 diff --git a/components/kubeflow/dnntrainer/component.yaml b/components/kubeflow/dnntrainer/component.yaml index 1a63ea022bb..714f3944874 100644 --- a/components/kubeflow/dnntrainer/component.yaml +++ b/components/kubeflow/dnntrainer/component.yaml @@ -1,18 +1,18 @@ name: Train FC DNN using TF description: Trains fully-connected neural network using Tensorflow inputs: - - {name: Transformed data dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path containing tf-transformed training and eval data.'} - - {name: Schema, type: {GcsPath: {data_type: JSON}}, description: 'GCS json schema file path.'} + - {name: Transformed data dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path containing tf-transformed training and eval data.'} + - {name: Schema, type: {GCSPath: {data_type: JSON}}, description: 'GCS json schema file path.'} - {name: Learning rate, type: Float, default: '0.1', description: 'Learning rate for training.'} - {name: Optimizer, type: String, default: 'Adagrad', description: 'Optimizer for training. Valid values are: Adam, SGD, Adagrad. If not provided, tf.estimator default will be used.'} - {name: Hidden layer size, type: String, default: '100', description: 'Comma-separated hidden layer sizes. For example "200,100,50".'} - {name: Steps, type: Integer, description: 'Maximum number of training steps to perform. If unspecified, will honor epochs.'} #- {name: Epochs, type: Integer, default: '', description: 'Maximum number of training data epochs on which to train. 
If both "steps" and "epochs" are specified, the training job will run for "steps" or "epochs", whichever occurs first.'} - {name: Target, type: String, description: 'Name of the column for prediction target.'} - - {name: Preprocessing module, type: {GcsPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} - - {name: Training output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} + - {name: Preprocessing module, type: {GCSPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} + - {name: Training output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'} outputs: - - {name: Training output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS or local directory.'} + - {name: Training output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'} implementation: container: image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:2c2445df83fa879387a200747cc20f72a7ee9727 diff --git a/components/local/confusion_matrix/component.yaml b/components/local/confusion_matrix/component.yaml index b7925d6ce5c..68d54a8df34 100644 --- a/components/local/confusion_matrix/component.yaml +++ b/components/local/confusion_matrix/component.yaml @@ -1,8 +1,8 @@ name: Confusion matrix description: Calculates confusion matrix inputs: - - {name: Predictions, type: {GcsPath: {data_type: CSV}}, description: 'GCS path of prediction file pattern.'} - - {name: Output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} + - {name: Predictions, type: {GCSPath: {data_type: CSV}}, description: 'GCS path of prediction file pattern.'} + - {name: Output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} #outputs: # - {name: UI metadata, type: UI metadata} # - {name: Metrics, type: Metrics} diff --git a/components/local/roc/component.yaml b/components/local/roc/component.yaml index 02b8a180ee0..5bcec836a26 100644 --- a/components/local/roc/component.yaml +++ b/components/local/roc/component.yaml @@ -1,11 +1,11 @@ name: ROC curve description: Calculates Receiver Operating Characteristic curve. See https://en.wikipedia.org/wiki/Receiver_operating_characteristic inputs: - - {name: Predictions dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir data + schema files + - {name: Predictions dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir data + schema files - {name: True class, type: String, default: 'true', description: 'The true class label for the sample. Default is "true".'} - {name: True score column, type: String, default: 'true', description: 'The name of the column for positive probability.'} - {name: Target lambda, type: String, default: '', description: 'Text of Python lambda function which returns boolean value indicating whether the classification result is correct.\nFor example, "lambda x: x[''a''] and x[''b'']". 
If missing, input must have a "target" column.'} - - {name: Output dir, type: {GcsPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file + - {name: Output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file #outputs: # - {name: UI metadata, type: UI metadata} # - {name: Metrics, type: Metrics} From 8ab82151ddf86981d4aeb1e3329fd810caa6b90c Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Tue, 2 Apr 2019 18:19:02 -0700 Subject: [PATCH 10/13] Added the definition for the "Kubeflow - Serve TF model" component --- components/kubeflow/deployer/component.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 components/kubeflow/deployer/component.yaml diff --git a/components/kubeflow/deployer/component.yaml b/components/kubeflow/deployer/component.yaml new file mode 100644 index 00000000000..34d6a94f4fd --- /dev/null +++ b/components/kubeflow/deployer/component.yaml @@ -0,0 +1,21 @@ +name: Kubeflow - Serve TF model +description: Serve TensorFlow model using Kubeflow TF-serving +inputs: + - {name: Model dir, type: {GCSPath: {path_type: Directory}}, description: 'Path of GCS directory containing exported Tensorflow model.'} + - {name: Cluster name, type: String, default: '', description: 'Kubernetes cluster name where the TS-serving service should be deployed. Uses the current cluster by default.'} + - {name: Namespace, type: String, default: 'kubeflow', description: 'Kubernetes namespace where the TS-serving service should be deployed.'} + - {name: Server name, type: String, default: 'model-server', description: 'TF-serving server name to use when deploying.'} + - {name: PVC name, type: String, default: '' , description: 'Optional PersistentVolumeClaim to use.'} +#outputs: +# - {name: Endppoint URI, type: Serving URI, description: 'URI of the deployed prediction service..'} +implementation: + container: + image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:2c2445df83fa879387a200747cc20f72a7ee9727 + command: [/bin/deploy.sh] + args: [ + --model-export-path, {inputValue: Model dir}, + --cluster-name, {inputValue: Cluster name}, + --namespace, {inputValue: Namespace}, + --server-name, {inputValue: Server name}, + --pvc-name, {inputValue: PVC name}, + ] From 3fce31858e2e557268049302c39b1296baf0e1d5 Mon Sep 17 00:00:00 2001 From: Alexey Volkov Date: Tue, 2 Apr 2019 20:02:47 -0700 Subject: [PATCH 11/13] Added the definition for the "Kubeflow - Launch StudyJob" component --- .../kubeflow/katib-launcher/component.yaml | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 components/kubeflow/katib-launcher/component.yaml diff --git a/components/kubeflow/katib-launcher/component.yaml b/components/kubeflow/katib-launcher/component.yaml new file mode 100644 index 00000000000..0cbcdf4c88a --- /dev/null +++ b/components/kubeflow/katib-launcher/component.yaml @@ -0,0 +1,38 @@ +name: Kubeflow - Launch StudyJob +description: Kubeflow StudyJob launcher +inputs: +- {name: StudyJob name, type: String, description: 'Job name.'} +- {name: Namespace, type: String, default: kubeflow, description: 'Namespace.'} +- {name: Optimization type, type: String, default: minimize, description: 'Direction of optimization. 
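
A rough sketch of how a definition like this is consumed from the SDK (illustration
only: it assumes the YAML above is already published at the raw-GitHub URL pattern
that the samples later in this series use, and the gs:// path is a placeholder):

    from kfp import components
    import kfp.dsl as dsl

    deployer_op = components.load_component_from_url(
        'https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/deployer/component.yaml')

    @dsl.pipeline(name='Serve model', description='Sketch only')
    def serve_pipeline(model_dir='gs://my-bucket/model/export'):
        # Each entry under 'inputs' becomes a pythonized keyword argument
        # ('Model dir' -> model_dir). At compile time, every
        # {inputValue: Model dir} placeholder in 'args' is replaced with the
        # argument's value, producing --model-export-path <value>.
        serve_task = deployer_op(model_dir=model_dir)

Inputs that declare a default (Cluster name, Namespace, Server name, PVC name) can
be omitted from the call.
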
From 3fce31858e2e557268049302c39b1296baf0e1d5 Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Tue, 2 Apr 2019 20:02:47 -0700
Subject: [PATCH 11/13] Added the definition for the "Kubeflow - Launch StudyJob" component

---
 .../kubeflow/katib-launcher/component.yaml | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 components/kubeflow/katib-launcher/component.yaml

diff --git a/components/kubeflow/katib-launcher/component.yaml b/components/kubeflow/katib-launcher/component.yaml
new file mode 100644
index 00000000000..0cbcdf4c88a
--- /dev/null
+++ b/components/kubeflow/katib-launcher/component.yaml
@@ -0,0 +1,38 @@
+name: Kubeflow - Launch StudyJob
+description: Kubeflow StudyJob launcher
+inputs:
+- {name: StudyJob name, type: String, description: 'Job name.'}
+- {name: Namespace, type: String, default: kubeflow, description: 'Namespace.'}
+- {name: Optimization type, type: String, default: minimize, description: 'Direction of optimization: minimize or maximize.'}
+- {name: Objective value name, type: String, description: 'Objective value name which the trainer optimizes.'}
+- {name: Optimization goal, type: Float, description: 'Stop studying once the objectivevaluename value exceeds the optimizationgoal.'}
+- {name: Request count, type: Integer, default: 1, description: 'Number of requests to the suggestion service.'}
+- {name: Metrics names, type: String, description: 'List of metric names (comma-delimited).'}
+- {name: Parameter configs, type: YAML, default: '', description: 'Parameter configs (YAML/JSON format).'}
+- {name: NAS config, type: YAML, default: '', description: 'NAS config (YAML/JSON format).'}
+- {name: Worker template path, type: String, default: '', description: 'Worker spec.'}
+- {name: Metrics collector template path, type: String, default: '', description: 'Metrics collector spec.'}
+- {name: Suggestion spec, type: YAML, default: '', description: 'Suggestion spec (YAML/JSON format).'}
+- {name: StudyJob timeout minutes, type: Integer, default: '10', description: 'Time in minutes to wait for the StudyJob to complete.'}
+outputs:
+- {name: Best parameter set, type: JSON, description: 'The parameter set of the best StudyJob trial.'}
+implementation:
+  container:
+    image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-studyjob:2c2445df83fa879387a200747cc20f72a7ee9727
+    command: [python, /ml/launch_study_job.py]
+    args: [
+      --name, {inputValue: StudyJob name},
+      --namespace, {inputValue: Namespace},
+      --optimizationtype, {inputValue: Optimization type},
+      --objectivevaluename, {inputValue: Objective value name},
+      --optimizationgoal, {inputValue: Optimization goal},
+      --requestcount, {inputValue: Request count},
+      --metricsnames, {inputValue: Metrics names},
+      --parameterconfigs, {inputValue: Parameter configs},
+      --nasConfig, {inputValue: NAS config},
+      --workertemplatepath, {inputValue: Worker template path},
+      --mcollectortemplatepath, {inputValue: Metrics collector template path},
+      --suggestionspec, {inputValue: Suggestion spec},
+      --studyjobtimeoutminutes, {inputValue: StudyJob timeout minutes},
+      --outputfile, {outputPath: Best parameter set},
+    ]

From dd55d253ab639f395d54eba182b5ce534bb09d8d Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Wed, 3 Apr 2019 18:19:59 -0700
Subject: [PATCH 12/13] Removed all properties from GCPPath

This will confuse our users and make type checking worse, but Hongye and
Ajay requested that.

`s/type: (\{GCSPath:.*?}})(.*)/type: GCPPath$2 # type: $1/g`

---
 components/dataflow/predict/component.yaml       | 10 +++++-----
 components/dataflow/tfdv/component.yaml          | 10 +++++-----
 components/dataflow/tfma/component.yaml          | 10 +++++-----
 components/dataflow/tft/component.yaml           | 12 ++++++------
 components/kubeflow/deployer/component.yaml      |  2 +-
 components/kubeflow/dnntrainer/component.yaml    | 10 +++++-----
 components/local/confusion_matrix/component.yaml |  4 ++--
 components/local/roc/component.yaml              |  4 ++--
 8 files changed, 31 insertions(+), 31 deletions(-)
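
The substitution above is what produced the `# type: {GCSPath: ...}` trailers in
the hunks below. For illustration, the same rewrite expressed in Python (a
re-implementation of the sed expression, not the tool that was actually run):

    import re

    line = ("  - {name: Model dir, type: {GCSPath: {path_type: Directory}}, "
            "description: 'Path of GCS directory containing exported Tensorflow model.'}")
    # The non-greedy .*?}} stops at the first '}}', so group 1 captures the
    # whole nested GCSPath dict; it is replaced by the bare GCPPath name and
    # preserved as a trailing comment.
    print(re.sub(r"type: (\{GCSPath:.*?}})(.*)",
                 r"type: GCPPath\2 # type: \1",
                 line))
    # prints:
    #   - {name: Model dir, type: GCPPath, description: 'Path of GCS directory
    #   containing exported Tensorflow model.'} # type: {GCSPath: {path_type: Directory}}
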
diff --git a/components/dataflow/predict/component.yaml b/components/dataflow/predict/component.yaml
index 66a0f367c36..0059dc4b993 100644
--- a/components/dataflow/predict/component.yaml
+++ b/components/dataflow/predict/component.yaml
@@ -3,16 +3,16 @@ description: |
   Runs TensorFlow prediction on Google Cloud Dataflow
   Input and output data is in GCS
 inputs:
-  - {name: Data file pattern, type: {GCSPath: {data_type: CSV}}, description: 'GCS or local path of test file patterns.'}
-  - {name: Schema, type: {GCSPath: {data_type: TFDV schema JSON}}, description: 'GCS json schema file path.'}
+  - {name: Data file pattern, type: GCPPath, description: 'GCS or local path of test file patterns.'} # type: {GCSPath: {data_type: CSV}}
+  - {name: Schema, type: GCPPath, description: 'GCS json schema file path.'} # type: {GCSPath: {data_type: TFDV schema JSON}}
   - {name: Target column, type: String, description: 'Name of the column for prediction target.'}
-  - {name: Model, type: {GCSPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer export only one model. #TODO: Output single model from trainer
+  - {name: Model, type: GCPPath, description: 'GCS or local path of model trained with tft preprocessed data.'} # Models trained with estimator are exported to base/export/export/123456781 directory. # Our trainer exports only one model. #TODO: Output single model from trainer # type: {GCSPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}
   - {name: Batch size, type: Integer, default: '32', description: 'Batch size used in prediction.'}
   - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow.
     Valid values are "local" and "cloud".'}
   - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'}
-  - {name: Predictions dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
+  - {name: Predictions dir, type: GCPPath, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file # type: {GCSPath: {path_type: Directory}}
 outputs:
-  - {name: Predictions dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file
+  - {name: Predictions dir, type: GCPPath, description: 'GCS or local directory.'} #Will contain prediction_results-* and schema.json files; TODO: Split outputs and replace dir with single file # type: {GCSPath: {path_type: Directory}}
 implementation:
   container:
     image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:2c2445df83fa879387a200747cc20f72a7ee9727
diff --git a/components/dataflow/tfdv/component.yaml b/components/dataflow/tfdv/component.yaml
index 4e9bd8eae80..63028ee795a 100644
--- a/components/dataflow/tfdv/component.yaml
+++ b/components/dataflow/tfdv/component.yaml
@@ -6,15 +6,15 @@ description: |
   * infer a schema,
   * detect data anomalies.
 inputs:
-- {name: Inference data, type: {GCSPath: {data_type: CSV}}, description: GCS path of the CSV file from which to infer the schema.}
-- {name: Validation data, type: {GCSPath: {data_type: CSV}}, description: GCS path of the CSV file whose contents should be validated.}
-- {name: Column names, type: {GCSPath: {data_type: JSON}}, description: GCS json file containing a list of column names.}
+- {name: Inference data, type: GCPPath, description: GCS path of the CSV file from which to infer the schema.} # type: {GCSPath: {data_type: CSV}}
+- {name: Validation data, type: GCPPath, description: GCS path of the CSV file whose contents should be validated.} # type: {GCSPath: {data_type: CSV}}
+- {name: Column names, type: GCPPath, description: GCS json file containing a list of column names.} # type: {GCSPath: {data_type: JSON}}
 - {name: Key columns, type: String, description: Comma separated list of columns to treat as keys.}
 - {name: GCP project, type: GcpProject, default: '', description: The GCP project to run the dataflow job.}
 - {name: Run mode, type: String, default: local, description: Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".
   }
-- {name: Validation output, type: {GCSPath: {path_type: Directory}}, description: GCS or local directory.}
+- {name: Validation output, type: GCPPath, description: GCS or local directory.} # type: {GCSPath: {path_type: Directory}}
 outputs:
-- {name: Schema, type: {GCSPath: {data_type: TFDV schema JSON}}, description: GCS path of the inferred schema JSON.}
+- {name: Schema, type: GCPPath, description: GCS path of the inferred schema JSON.} # type: {GCSPath: {data_type: TFDV schema JSON}}
 - {name: Validation result, type: String, description: Indicates whether anomalies were detected or not.}
 implementation:
   container:
diff --git a/components/dataflow/tfma/component.yaml b/components/dataflow/tfma/component.yaml
index cbd0a7be19e..95797556e3a 100644
--- a/components/dataflow/tfma/component.yaml
+++ b/components/dataflow/tfma/component.yaml
@@ -6,15 +6,15 @@ description: |
   * tracking metrics over time
   * model quality performance on different feature slices
 inputs:
-- {name: Model, type: {GCSPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}, description: GCS path to the model which will be evaluated.}
-- {name: Evaluation data, type: {GCSPath: {data_type: CSV}}, description: GCS path of eval files.}
-- {name: Schema, type: {GCSPath: {data_type: TFDV schema JSON}}, description: GCS json schema file path.}
+- {name: Model, type: GCPPath, description: GCS path to the model which will be evaluated.} # type: {GCSPath: {path_type: Directory, data_type: Exported TensorFlow models dir}}
+- {name: Evaluation data, type: GCPPath, description: GCS path of eval files.} # type: {GCSPath: {data_type: CSV}}
+- {name: Schema, type: GCPPath, description: GCS json schema file path.} # type: {GCSPath: {data_type: TFDV schema JSON}}
 - {name: Run mode, type: String, default: local, description: whether to run the job locally or in Cloud Dataflow.}
 - {name: GCP project, type: GcpProject, default: '', description: 'The GCP project to run the dataflow job, if running in the `cloud` mode.'}
 - {name: Slice columns, type: String, description: Comma-separated list of columns on which to slice for analysis.}
-- {name: Analysis results dir, type: {GCSPath: {path_type: Directory}}, description: GCS or local directory where the analysis results should be written.}
+- {name: Analysis results dir, type: GCPPath, description: GCS or local directory where the analysis results should be written.} # type: {GCSPath: {path_type: Directory}}
 outputs:
-- {name: Analysis results dir, type: {GCSPath: {path_type: Directory}}, description: GCS or local directory where the analysis results were written.}
+- {name: Analysis results dir, type: GCPPath, description: GCS or local directory where the analysis results were written.} # type: {GCSPath: {path_type: Directory}}
 implementation:
   container:
     image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tfma:2c2445df83fa879387a200747cc20f72a7ee9727
diff --git a/components/dataflow/tft/component.yaml b/components/dataflow/tft/component.yaml
index 0a7909e8f64..1b9e642b198 100644
--- a/components/dataflow/tft/component.yaml
+++ b/components/dataflow/tft/component.yaml
@@ -1,15 +1,15 @@
 name: Transform using TF on Dataflow
 description: Runs TensorFlow Transform on Google Cloud Dataflow
 inputs:
-  - {name: Training data file pattern, type: {GCSPath: {data_type: CSV}}, description: 'GCS path of train file patterns.'} #Also supports local CSV
-  - {name: Evaluation data file pattern, type: {GCSPath: {data_type: CSV}}, description: 'GCS path of eval file patterns.'} #Also supports local CSV
-  - {name: Schema, type: {GCSPath: {data_type: JSON}}, description: 'GCS json schema file path.'}
+  - {name: Training data file pattern, type: GCPPath, description: 'GCS path of train file patterns.'} #Also supports local CSV # type: {GCSPath: {data_type: CSV}}
+  - {name: Evaluation data file pattern, type: GCPPath, description: 'GCS path of eval file patterns.'} #Also supports local CSV # type: {GCSPath: {data_type: CSV}}
+  - {name: Schema, type: GCPPath, description: 'GCS json schema file path.'} # type: {GCSPath: {data_type: JSON}}
   - {name: GCP project, type: GcpProject, description: 'The GCP project to run the dataflow job.'}
   - {name: Run mode, type: String, default: local, description: 'Whether to run the job locally or in Cloud Dataflow. Valid values are "local" and "cloud".'
     }
-  - {name: Preprocessing module, type: {GCSPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'}
-  - {name: Transformed data dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory'} #Also supports local paths
+  - {name: Preprocessing module, type: GCPPath, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} # type: {GCSPath: {data_type: Python}}
+  - {name: Transformed data dir, type: GCPPath, description: 'GCS or local directory'} #Also supports local paths # type: {GCSPath: {path_type: Directory}}
 outputs:
-  - {name: Transformed data dir, type: {GCSPath: {path_type: Directory}}}
+  - {name: Transformed data dir, type: GCPPath} # type: {GCSPath: {path_type: Directory}}
 implementation:
   container:
     image: gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:2c2445df83fa879387a200747cc20f72a7ee9727
diff --git a/components/kubeflow/deployer/component.yaml b/components/kubeflow/deployer/component.yaml
index 34d6a94f4fd..95cce10936f 100644
--- a/components/kubeflow/deployer/component.yaml
+++ b/components/kubeflow/deployer/component.yaml
@@ -1,7 +1,7 @@
 name: Kubeflow - Serve TF model
 description: Serve TensorFlow model using Kubeflow TF-serving
 inputs:
-  - {name: Model dir, type: {GCSPath: {path_type: Directory}}, description: 'Path of GCS directory containing exported Tensorflow model.'}
+  - {name: Model dir, type: GCPPath, description: 'Path of GCS directory containing exported Tensorflow model.'} # type: {GCSPath: {path_type: Directory}}
   - {name: Cluster name, type: String, default: '', description: 'Kubernetes cluster name where the TF-serving service should be deployed. Uses the current cluster by default.'}
   - {name: Namespace, type: String, default: 'kubeflow', description: 'Kubernetes namespace where the TF-serving service should be deployed.'}
   - {name: Server name, type: String, default: 'model-server', description: 'TF-serving server name to use when deploying.'}
diff --git a/components/kubeflow/dnntrainer/component.yaml b/components/kubeflow/dnntrainer/component.yaml
index 714f3944874..fd8a77e8b6d 100644
--- a/components/kubeflow/dnntrainer/component.yaml
+++ b/components/kubeflow/dnntrainer/component.yaml
@@ -1,18 +1,18 @@
 name: Train FC DNN using TF
 description: Trains fully-connected neural network using Tensorflow
 inputs:
-  - {name: Transformed data dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path containing tf-transformed training and eval data.'}
-  - {name: Schema, type: {GCSPath: {data_type: JSON}}, description: 'GCS json schema file path.'}
+  - {name: Transformed data dir, type: GCPPath, description: 'GCS path containing tf-transformed training and eval data.'} # type: {GCSPath: {path_type: Directory}}
+  - {name: Schema, type: GCPPath, description: 'GCS json schema file path.'} # type: {GCSPath: {data_type: JSON}}
   - {name: Learning rate, type: Float, default: '0.1', description: 'Learning rate for training.'}
   - {name: Optimizer, type: String, default: 'Adagrad', description: 'Optimizer for training. Valid values are: Adam, SGD, Adagrad. If not provided, tf.estimator default will be used.'}
   - {name: Hidden layer size, type: String, default: '100', description: 'Comma-separated hidden layer sizes. For example "200,100,50".'}
   - {name: Steps, type: Integer, description: 'Maximum number of training steps to perform. If unspecified, will honor epochs.'}
 #- {name: Epochs, type: Integer, default: '', description: 'Maximum number of training data epochs on which to train.
 #  If both "steps" and "epochs" are specified, the training job will run for "steps" or "epochs", whichever occurs first.'}
   - {name: Target, type: String, description: 'Name of the column for prediction target.'}
-  - {name: Preprocessing module, type: {GCSPath: {data_type: Python}}, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'}
-  - {name: Training output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'}
+  - {name: Preprocessing module, type: GCPPath, default: '', description: 'GCS path to a python file defining "preprocess" and "get_feature_columns" functions.'} # type: {GCSPath: {data_type: Python}}
+  - {name: Training output dir, type: GCPPath, description: 'GCS or local directory.'} # type: {GCSPath: {path_type: Directory}}
 outputs:
-  - {name: Training output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS or local directory.'}
+  - {name: Training output dir, type: GCPPath, description: 'GCS or local directory.'} # type: {GCSPath: {path_type: Directory}}
 implementation:
   container:
     image: gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:2c2445df83fa879387a200747cc20f72a7ee9727
diff --git a/components/local/confusion_matrix/component.yaml b/components/local/confusion_matrix/component.yaml
index 68d54a8df34..623fbf08204 100644
--- a/components/local/confusion_matrix/component.yaml
+++ b/components/local/confusion_matrix/component.yaml
@@ -1,8 +1,8 @@
 name: Confusion matrix
 description: Calculates confusion matrix
 inputs:
-  - {name: Predictions, type: {GCSPath: {data_type: CSV}}, description: 'GCS path of prediction file pattern.'}
-  - {name: Output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path of the output directory.'}
+  - {name: Predictions, type: GCPPath, description: 'GCS path of prediction file pattern.'} # type: {GCSPath: {data_type: CSV}}
+  - {name: Output dir, type: GCPPath, description: 'GCS path of the output directory.'} # type: {GCSPath: {path_type: Directory}}
 #outputs:
 #  - {name: UI metadata, type: UI metadata}
 #  - {name: Metrics, type: Metrics}
diff --git a/components/local/roc/component.yaml b/components/local/roc/component.yaml
index 5bcec836a26..cd6b65a70bd 100644
--- a/components/local/roc/component.yaml
+++ b/components/local/roc/component.yaml
@@ -1,11 +1,11 @@
 name: ROC curve
 description: Calculates Receiver Operating Characteristic curve. See https://en.wikipedia.org/wiki/Receiver_operating_characteristic
 inputs:
-  - {name: Predictions dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir data + schema files
+  - {name: Predictions dir, type: GCPPath, description: 'GCS path of prediction file pattern.'} #TODO: Replace dir with data + schema files # type: {GCSPath: {path_type: Directory}}
   - {name: True class, type: String, default: 'true', description: 'The true class label for the sample. Default is "true".'}
   - {name: True score column, type: String, default: 'true', description: 'The name of the column for positive probability.'}
   - {name: Target lambda, type: String, default: '', description: 'Text of Python lambda function which returns boolean value indicating whether the classification result is correct.\nFor example, "lambda x: x[''a''] and x[''b'']".
     If missing, input must have a "target" column.'}
-  - {name: Output dir, type: {GCSPath: {path_type: Directory}}, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file
+  - {name: Output dir, type: GCPPath, description: 'GCS path of the output directory.'} #TODO: Replace dir with single file # type: {GCSPath: {path_type: Directory}}
 #outputs:
 #  - {name: UI metadata, type: UI metadata}
 #  - {name: Metrics, type: Metrics}

From a3fba176d72c289feb7cd1cda4b8ab08abcdcf85 Mon Sep 17 00:00:00 2001
From: Alexey Volkov
Date: Wed, 3 Apr 2019 18:33:02 -0700
Subject: [PATCH 13/13] Removed the usage of the ComponentStore

Now the samples are invalid until they're merged to master, but Hongye
asked for that.

---
 .../kubeflow-training-classification.py      | 14 +++++---------
 samples/xgboost-spark/xgboost-training-cm.py | 11 +++--------
 2 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/samples/kubeflow-tf/kubeflow-training-classification.py b/samples/kubeflow-tf/kubeflow-training-classification.py
index 36e2a67be5b..56423088175 100755
--- a/samples/kubeflow-tf/kubeflow-training-classification.py
+++ b/samples/kubeflow-tf/kubeflow-training-classification.py
@@ -16,16 +16,12 @@
 import kfp.dsl as dsl
 import kfp.gcp as gcp
-from kfp.components import ComponentStore
+from kfp import components
-
-cs = ComponentStore()
-cs.url_search_prefixes.append('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/')
-cs.url_search_prefixes.append('https://raw.githubusercontent.com/Ark-kun/pipelines/Added-component-definitions-to-our-components/components/')
-
-dataflow_tf_transform_op = cs.load_component('dataflow/tft')
-kubeflow_tf_training_op = cs.load_component('kubeflow/dnntrainer')
-dataflow_tf_predict_op = cs.load_component('dataflow/predict')
-confusion_matrix_op = cs.load_component('local/confusion_matrix')
+dataflow_tf_transform_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/dataflow/tft/component.yaml')
+kubeflow_tf_training_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/kubeflow/dnntrainer/component.yaml')
+dataflow_tf_predict_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/dataflow/predict/component.yaml')
+confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/local/confusion_matrix/component.yaml')
 
 @dsl.pipeline(
   name='Pipeline TFJob',
diff --git a/samples/xgboost-spark/xgboost-training-cm.py b/samples/xgboost-spark/xgboost-training-cm.py
index 69021c38dbe..05ec4c07850 100755
--- a/samples/xgboost-spark/xgboost-training-cm.py
+++ b/samples/xgboost-spark/xgboost-training-cm.py
@@ -17,15 +17,10 @@
 import kfp.dsl as dsl
 import kfp.gcp as gcp
-from kfp.components import ComponentStore
-
-cs = ComponentStore()
-cs.url_search_prefixes.append('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/')
-cs.url_search_prefixes.append('https://raw.githubusercontent.com/Ark-kun/pipelines/Added-component-definitions-to-our-components/components/')
-
-confusion_matrix_op = cs.load_component('local/confusion_matrix')
-roc_op = cs.load_component('local/roc')
+from kfp import components
+confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/local/confusion_matrix/component.yaml')
+roc_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/master/components/local/roc/component.yaml')
 
 # ================================================================
 # The following classes should be provided by components provider.
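
For reference, a sketch of how the relocated ops are then wired up and compiled
(hypothetical pipeline name and GCS paths; note that load_component_from_url
fetches each URL at load time, which is why these samples stay broken until the
component files above land on master):

    import kfp.dsl as dsl
    import kfp.compiler as compiler

    @dsl.pipeline(name='ROC curve demo', description='Sketch only')
    def roc_demo(predictions='gs://my-bucket/predictions', output='gs://my-bucket/roc'):
        # Component input names ('Predictions dir', 'Output dir', ...) surface
        # as pythonized keyword arguments on the loaded op.
        roc_task = roc_op(predictions_dir=predictions, output_dir=output)

    if __name__ == '__main__':
        compiler.Compiler().compile(roc_demo, 'roc_demo.tar.gz')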