From e80b2ec99b679112453bef322603d12819f3732c Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Tue, 11 Feb 2020 09:13:10 -0800 Subject: [PATCH 1/8] Removing unnecessary parameters. --- .../xgboost_training_cm.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index 665c36eacc3..8121095a612 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -22,7 +22,7 @@ import subprocess diagnose_me_op = components.load_component_from_url( - 'https://raw.githubusercontent.com/kubeflow/pipelines/df450617af6e385da8c436628afafb1c76ca6c79/components/diagnostics/diagnose_me/component.yaml') + 'https://raw.githubusercontent.com/kubeflow/pipelines/d0ef0c8dc44a97fb35a7915d334432c6303ef26c/components/diagnostics/diagnose_me/component.yaml') confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml') @@ -207,19 +207,19 @@ def dataproc_predict_op( def xgb_train_pipeline( output='gs://{{kfp-default-bucket}}', project='{{kfp-project-id}}', - cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER, region='us-central1', - train_data='gs://ml-pipeline-playground/sfpd/train.csv', - eval_data='gs://ml-pipeline-playground/sfpd/eval.csv', - schema='gs://ml-pipeline-playground/sfpd/schema.json', - target='resolution', - execution_mode='HALT_ON_ERROR', - required_apis='stackdriver.googleapis.com, storage-api.googleapis.com, bigquery.googleapis.com, dataflow.googleapis.com, dataproc.googleapis.com', + diagnostic_mode='HALT_ON_ERROR', rounds=200, workers=2, - true_label='ACTION', ): output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data' + train_data='gs://ml-pipeline-playground/sfpd/train.csv' + 
eval_data='gs://ml-pipeline-playground/sfpd/eval.csv' + schema='gs://ml-pipeline-playground/sfpd/schema.json' + true_label='ACTION' + target='resolution' + required_apis='storage-api.googleapis.com, dataproc.googleapis.com' + cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER # Current GCP pyspark/spark op do not provide outputs as return values, instead, # we need to use strings to pass the uri around. @@ -231,7 +231,7 @@ def xgb_train_pipeline( _diagnose_me_op = diagnose_me_op( bucket=output, - execution_mode=execution_mode, + execution_mode=diagnostic_mode, project_id=project, target_apis=required_apis) From 456aed7fc02cb52ca2931505be29659bb54826ed Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Tue, 11 Feb 2020 09:56:34 -0800 Subject: [PATCH 2/8] changing default workers to 1 and rounds to 5 --- samples/core/xgboost_training_cm/xgboost_training_cm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index 8121095a612..5cbfc12ea72 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -209,8 +209,8 @@ def xgb_train_pipeline( project='{{kfp-project-id}}', region='us-central1', diagnostic_mode='HALT_ON_ERROR', - rounds=200, - workers=2, + rounds=5, + workers=1, ): output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data' train_data='gs://ml-pipeline-playground/sfpd/train.csv' From 0b2375ba0ccc298d44ea954dcf834cf57b822154 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Thu, 13 Feb 2020 11:07:54 -0800 Subject: [PATCH 3/8] removing region --- samples/core/xgboost_training_cm/README.md | 19 +++++++++++++------ .../xgboost_training_cm.py | 2 +- .../configs/xgboost_training_cm.config.yaml | 3 --- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/samples/core/xgboost_training_cm/README.md b/samples/core/xgboost_training_cm/README.md 
index 7cb6e3dbf34..208bab6e23c 100644 --- a/samples/core/xgboost_training_cm/README.md +++ b/samples/core/xgboost_training_cm/README.md @@ -2,7 +2,7 @@ The `xgboost_training_cm.py` pipeline creates XGBoost models on structured data in CSV format. Both classification and regression are supported. -The pipeline starts by creating an Google DataProc cluster, and then running analysis, transformation, distributed training and +The pipeline starts by creating a Google DataProc cluster, and then running analysis, transformation, distributed training and prediction in the created cluster. Then a single node confusion-matrix and ROC aggregator is used (for classification case) to provide the confusion matrix data, and ROC data to the front end, respectively. @@ -28,11 +28,18 @@ Open the Kubeflow pipelines UI. Create a new pipeline, and then upload the compi ## Run -Most arguments come with default values. Only `output` and `project` need to be filled always. - -* `output` is a Google Storage path which holds -pipeline run results. Note that each pipeline run will create a unique directory under `output` so it will not override previous results. -* `project` is a GCP project. +All arguments come with default values. This pipeline is preloaded as a Demo pipeline in Pipeline UI. You can run the pipeline without any changes. + +## Modifying the pipeline +To do additional exploration you may change some of the parameters, or pipeline input that is currently specified in the pipeline definition. + +* `output` is a Google Storage path which holds pipeline run results. +Note that each pipeline run will create a unique directory under `output` so it will not override previous results. +* `workers` is the number of worker nodes used for this training. +* `rounds` is the number of XGBoost training iterations. Set the value to 200 to get a reasonable trained model. +* `train_data` points to a CSV file that contains the training data. 
For a sample see 'gs://ml-pipeline-playground/sfpd/train.csv'. +* `eval_data` points to a CSV file that contains the evaluation data. For a sample see 'gs://ml-pipeline-playground/sfpd/eval.csv'. +* `schema` points to a schema file for train and eval datasets. For a sample see 'gs://ml-pipeline-playground/sfpd/schema.json'. ## Components source diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index 5cbfc12ea72..f05cb5a1ba3 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -207,12 +207,12 @@ def dataproc_predict_op( def xgb_train_pipeline( output='gs://{{kfp-default-bucket}}', project='{{kfp-project-id}}', - region='us-central1', diagnostic_mode='HALT_ON_ERROR', rounds=5, workers=1, ): output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data' + region='us-central1' train_data='gs://ml-pipeline-playground/sfpd/train.csv' eval_data='gs://ml-pipeline-playground/sfpd/eval.csv' schema='gs://ml-pipeline-playground/sfpd/schema.json' diff --git a/test/sample-test/configs/xgboost_training_cm.config.yaml b/test/sample-test/configs/xgboost_training_cm.config.yaml index 49dac8b3600..0704a826e36 100644 --- a/test/sample-test/configs/xgboost_training_cm.config.yaml +++ b/test/sample-test/configs/xgboost_training_cm.config.yaml @@ -16,9 +16,6 @@ test_name: xgboost_training_cm arguments: output: project: ml-pipeline-test - train_data: gs://ml-pipeline-dataset/sample-test/sfpd/train_20.csv - eval_data: gs://ml-pipeline-dataset/sample-test/sfpd/eval_5.csv - schema: gs://ml-pipeline-dataset/sample-test/sfpd/schema.json rounds: 5 workers: 2 test_timeout: 1800 # xgboost needs extra time. 
From 11079abea40de4873aaf2794d092b1b9f646df6e Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Thu, 13 Feb 2020 16:13:09 -0800 Subject: [PATCH 4/8] Adding quota check --- samples/core/xgboost_training_cm/xgboost_training_cm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index f05cb5a1ba3..bdb01f11782 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -22,7 +22,7 @@ import subprocess diagnose_me_op = components.load_component_from_url( - 'https://raw.githubusercontent.com/kubeflow/pipelines/d0ef0c8dc44a97fb35a7915d334432c6303ef26c/components/diagnostics/diagnose_me/component.yaml') + 'https://raw.githubusercontent.com/numerology/pipelines/1611ec7ac09f69fc6d382de8702045ee3fad55ef/components/diagnostics/diagnose_me/component.yaml') confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml') @@ -213,6 +213,7 @@ def xgb_train_pipeline( ): output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data' region='us-central1' + quota_check=[{'region':region,'metric':'CPUS','quota_needed':1.0}] train_data='gs://ml-pipeline-playground/sfpd/train.csv' eval_data='gs://ml-pipeline-playground/sfpd/eval.csv' schema='gs://ml-pipeline-playground/sfpd/schema.json' @@ -220,6 +221,7 @@ def xgb_train_pipeline( target='resolution' required_apis='storage-api.googleapis.com, dataproc.googleapis.com' cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER + diagnostic_quota = # Current GCP pyspark/spark op do not provide outputs as return values, instead, # we need to use strings to pass the uri around. 
@@ -233,7 +235,8 @@ def xgb_train_pipeline( bucket=output, execution_mode=diagnostic_mode, project_id=project, - target_apis=required_apis) + target_apis=required_apis, + quota_check=quota_check) with dsl.ExitHandler(exit_op=dataproc_delete_cluster_op( project_id=project, From 73288b0087dd8086810d3f2f4d151bb97295588a Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Thu, 13 Feb 2020 16:50:50 -0800 Subject: [PATCH 5/8] correcting typo --- samples/core/xgboost_training_cm/xgboost_training_cm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index bdb01f11782..28c1066370e 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -221,7 +221,6 @@ def xgb_train_pipeline( target='resolution' required_apis='storage-api.googleapis.com, dataproc.googleapis.com' cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER - diagnostic_quota = # Current GCP pyspark/spark op do not provide outputs as return values, instead, # we need to use strings to pass the uri around. 
From a646a0ee021bd17c579850ee8edaa5c4a237d2e7 Mon Sep 17 00:00:00 2001 From: Sina Chavoshi Date: Thu, 13 Feb 2020 18:02:31 -0800 Subject: [PATCH 6/8] updating the diagnose_me component --- samples/core/xgboost_training_cm/xgboost_training_cm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index 28c1066370e..1c4ab061a0d 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -22,7 +22,7 @@ import subprocess diagnose_me_op = components.load_component_from_url( - 'https://raw.githubusercontent.com/numerology/pipelines/1611ec7ac09f69fc6d382de8702045ee3fad55ef/components/diagnostics/diagnose_me/component.yaml') + 'https://raw.githubusercontent.com/kubeflow/pipelines/9b0e9efb382c499cef2750d71cb7b97cbd65cd07/components/diagnostics/diagnose_me/component.yaml') confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml') From d3510e8585b55d0b13ebae64f7f5817a83c0496a Mon Sep 17 00:00:00 2001 From: sina chavoshi Date: Fri, 14 Feb 2020 09:17:11 -0800 Subject: [PATCH 7/8] Removing fail on error for diagnostic step --- test/sample-test/configs/xgboost_training_cm.config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/test/sample-test/configs/xgboost_training_cm.config.yaml b/test/sample-test/configs/xgboost_training_cm.config.yaml index 0704a826e36..44e72c80750 100644 --- a/test/sample-test/configs/xgboost_training_cm.config.yaml +++ b/test/sample-test/configs/xgboost_training_cm.config.yaml @@ -18,4 +18,5 @@ arguments: project: ml-pipeline-test rounds: 5 workers: 2 + diagnostic_mode: False test_timeout: 1800 # xgboost needs extra time. 
From ec45a0c2780534463846868f4d887e106da68e72 Mon Sep 17 00:00:00 2001 From: sina chavoshi Date: Fri, 14 Feb 2020 09:29:54 -0800 Subject: [PATCH 8/8] Update samples/core/xgboost_training_cm/xgboost_training_cm.py Co-Authored-By: Jiaxiao Zheng --- samples/core/xgboost_training_cm/xgboost_training_cm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py index 1c4ab061a0d..8705c806f5b 100644 --- a/samples/core/xgboost_training_cm/xgboost_training_cm.py +++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py @@ -22,7 +22,7 @@ import subprocess diagnose_me_op = components.load_component_from_url( - 'https://raw.githubusercontent.com/kubeflow/pipelines/9b0e9efb382c499cef2750d71cb7b97cbd65cd07/components/diagnostics/diagnose_me/component.yaml') + 'https://raw.githubusercontent.com/kubeflow/pipelines/566dddfdfc0a6a725b6e50ea85e73d8d5578bbb9/components/diagnostics/diagnose_me/component.yaml') confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml')