From e80b2ec99b679112453bef322603d12819f3732c Mon Sep 17 00:00:00 2001
From: Sina Chavoshi <chavoshi@google.com>
Date: Tue, 11 Feb 2020 09:13:10 -0800
Subject: [PATCH 1/8] Removing unnecessary parameters.

---
 .../xgboost_training_cm.py                    | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index 665c36eacc3..8121095a612 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -22,7 +22,7 @@
 import subprocess
 
 diagnose_me_op = components.load_component_from_url(
-    'https://raw.githubusercontent.com/kubeflow/pipelines/df450617af6e385da8c436628afafb1c76ca6c79/components/diagnostics/diagnose_me/component.yaml')
+    'https://raw.githubusercontent.com/kubeflow/pipelines/d0ef0c8dc44a97fb35a7915d334432c6303ef26c/components/diagnostics/diagnose_me/component.yaml')
 
 confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml')
 
@@ -207,19 +207,19 @@ def dataproc_predict_op(
 def xgb_train_pipeline(
     output='gs://{{kfp-default-bucket}}',
     project='{{kfp-project-id}}',
-    cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER,
     region='us-central1',
-    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
-    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
-    schema='gs://ml-pipeline-playground/sfpd/schema.json',
-    target='resolution',
-    execution_mode='HALT_ON_ERROR',
-    required_apis='stackdriver.googleapis.com, storage-api.googleapis.com, bigquery.googleapis.com, dataflow.googleapis.com, dataproc.googleapis.com',
+    diagnostic_mode='HALT_ON_ERROR',
     rounds=200,
     workers=2,
-    true_label='ACTION',
 ):
     output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'
+    train_data='gs://ml-pipeline-playground/sfpd/train.csv'
+    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv'
+    schema='gs://ml-pipeline-playground/sfpd/schema.json'
+    true_label='ACTION'
+    target='resolution'
+    required_apis='storage-api.googleapis.com, dataproc.googleapis.com'
+    cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER
 
     # Current GCP pyspark/spark op do not provide outputs as return values, instead,
     # we need to use strings to pass the uri around.
@@ -231,7 +231,7 @@ def xgb_train_pipeline(
     
     _diagnose_me_op = diagnose_me_op(
         bucket=output,
-        execution_mode=execution_mode,
+        execution_mode=diagnostic_mode,
         project_id=project, 
         target_apis=required_apis)
     

From 456aed7fc02cb52ca2931505be29659bb54826ed Mon Sep 17 00:00:00 2001
From: Sina Chavoshi <chavoshi@google.com>
Date: Tue, 11 Feb 2020 09:56:34 -0800
Subject: [PATCH 2/8] changing default workers to 1 and rounds to 5

---
 samples/core/xgboost_training_cm/xgboost_training_cm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index 8121095a612..5cbfc12ea72 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -209,8 +209,8 @@ def xgb_train_pipeline(
     project='{{kfp-project-id}}',
     region='us-central1',
     diagnostic_mode='HALT_ON_ERROR',
-    rounds=200,
-    workers=2,
+    rounds=5,
+    workers=1,
 ):
     output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'
     train_data='gs://ml-pipeline-playground/sfpd/train.csv'

From 0b2375ba0ccc298d44ea954dcf834cf57b822154 Mon Sep 17 00:00:00 2001
From: Sina Chavoshi <chavoshi@google.com>
Date: Thu, 13 Feb 2020 11:07:54 -0800
Subject: [PATCH 3/8] removing region

---
 samples/core/xgboost_training_cm/README.md    | 19 +++++++++++++------
 .../xgboost_training_cm.py                    |  2 +-
 .../configs/xgboost_training_cm.config.yaml   |  3 ---
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/samples/core/xgboost_training_cm/README.md b/samples/core/xgboost_training_cm/README.md
index 7cb6e3dbf34..208bab6e23c 100644
--- a/samples/core/xgboost_training_cm/README.md
+++ b/samples/core/xgboost_training_cm/README.md
@@ -2,7 +2,7 @@
 
 The `xgboost_training_cm.py` pipeline creates XGBoost models on structured data in CSV format. Both classification and regression are supported.
 
-The pipeline starts by creating an Google DataProc cluster, and then running analysis, transformation, distributed training and 
+The pipeline starts by creating a Google DataProc cluster, and then running analysis, transformation, distributed training and 
 prediction in the created cluster. 
 Then a single node confusion-matrix and ROC aggregator is used (for classification case) to	
 provide the confusion matrix data, and ROC data to the front end, respectively.
@@ -28,11 +28,18 @@ Open the Kubeflow pipelines UI. Create a new pipeline, and then upload the compi
 
 ## Run
 
-Most arguments come with default values. Only `output` and `project` need to be filled always. 
-
-* `output` is a Google Storage path which holds
-pipeline run results. Note that each pipeline run will create a unique directory under `output` so it will not override previous results. 
-* `project` is a GCP project.
+All arguments come with default values. This pipeline is preloaded as a Demo pipeline in Pipeline UI. You can run the pipeline without any changes.
+
+## Modifying the pipeline
+For additional exploration, you may change some of the parameters or the pipeline inputs that are currently specified in the pipeline definition.  
+ 
+* `output` is a Google Storage path which holds pipeline run results.
+Note that each pipeline run will create a unique directory under `output` so it will not override previous results.
+* `workers` is the number of worker nodes used for this training.
+* `rounds` is the number of XGBoost training iterations. Set the value to 200 to get a reasonably well-trained model.
+* `train_data` points to a CSV file that contains the training data. For a sample see 'gs://ml-pipeline-playground/sfpd/train.csv'.
+* `eval_data` points to a CSV file that contains the evaluation data. For a sample see 'gs://ml-pipeline-playground/sfpd/eval.csv'.
+* `schema` points to a schema file for train and eval datasets. For a sample see 'gs://ml-pipeline-playground/sfpd/schema.json'.
 
 ## Components source
 
diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index 5cbfc12ea72..f05cb5a1ba3 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -207,12 +207,12 @@ def dataproc_predict_op(
 def xgb_train_pipeline(
     output='gs://{{kfp-default-bucket}}',
     project='{{kfp-project-id}}',
-    region='us-central1',
     diagnostic_mode='HALT_ON_ERROR',
     rounds=5,
     workers=1,
 ):
     output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'
+    region='us-central1'
     train_data='gs://ml-pipeline-playground/sfpd/train.csv'
     eval_data='gs://ml-pipeline-playground/sfpd/eval.csv'
     schema='gs://ml-pipeline-playground/sfpd/schema.json'
diff --git a/test/sample-test/configs/xgboost_training_cm.config.yaml b/test/sample-test/configs/xgboost_training_cm.config.yaml
index 49dac8b3600..0704a826e36 100644
--- a/test/sample-test/configs/xgboost_training_cm.config.yaml
+++ b/test/sample-test/configs/xgboost_training_cm.config.yaml
@@ -16,9 +16,6 @@ test_name: xgboost_training_cm
 arguments:
   output:
   project: ml-pipeline-test
-  train_data: gs://ml-pipeline-dataset/sample-test/sfpd/train_20.csv
-  eval_data: gs://ml-pipeline-dataset/sample-test/sfpd/eval_5.csv
-  schema: gs://ml-pipeline-dataset/sample-test/sfpd/schema.json
   rounds: 5
   workers: 2
 test_timeout: 1800 # xgboost needs extra time.

From 11079abea40de4873aaf2794d092b1b9f646df6e Mon Sep 17 00:00:00 2001
From: Sina Chavoshi <chavoshi@google.com>
Date: Thu, 13 Feb 2020 16:13:09 -0800
Subject: [PATCH 4/8] Adding quota check

---
 samples/core/xgboost_training_cm/xgboost_training_cm.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index f05cb5a1ba3..bdb01f11782 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -22,7 +22,7 @@
 import subprocess
 
 diagnose_me_op = components.load_component_from_url(
-    'https://raw.githubusercontent.com/kubeflow/pipelines/d0ef0c8dc44a97fb35a7915d334432c6303ef26c/components/diagnostics/diagnose_me/component.yaml')
+    'https://raw.githubusercontent.com/numerology/pipelines/1611ec7ac09f69fc6d382de8702045ee3fad55ef/components/diagnostics/diagnose_me/component.yaml')
 
 confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml')
 
@@ -213,6 +213,7 @@ def xgb_train_pipeline(
 ):
     output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'
     region='us-central1'
+    quota_check=[{'region':region,'metric':'CPUS','quota_needed':1.0}]
     train_data='gs://ml-pipeline-playground/sfpd/train.csv'
     eval_data='gs://ml-pipeline-playground/sfpd/eval.csv'
     schema='gs://ml-pipeline-playground/sfpd/schema.json'
@@ -220,6 +221,7 @@ def xgb_train_pipeline(
     target='resolution'
     required_apis='storage-api.googleapis.com, dataproc.googleapis.com'
     cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER
+    diagnostic_quota = 
 
     # Current GCP pyspark/spark op do not provide outputs as return values, instead,
     # we need to use strings to pass the uri around.
@@ -233,7 +235,8 @@ def xgb_train_pipeline(
         bucket=output,
         execution_mode=diagnostic_mode,
         project_id=project, 
-        target_apis=required_apis)
+        target_apis=required_apis,
+        quota_check=quota_check)
     
     with dsl.ExitHandler(exit_op=dataproc_delete_cluster_op(
         project_id=project,

From 73288b0087dd8086810d3f2f4d151bb97295588a Mon Sep 17 00:00:00 2001
From: Sina Chavoshi <chavoshi@google.com>
Date: Thu, 13 Feb 2020 16:50:50 -0800
Subject: [PATCH 5/8] correcting typo

---
 samples/core/xgboost_training_cm/xgboost_training_cm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index bdb01f11782..28c1066370e 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -221,7 +221,6 @@ def xgb_train_pipeline(
     target='resolution'
     required_apis='storage-api.googleapis.com, dataproc.googleapis.com'
     cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER
-    diagnostic_quota = 
 
     # Current GCP pyspark/spark op do not provide outputs as return values, instead,
     # we need to use strings to pass the uri around.

From a646a0ee021bd17c579850ee8edaa5c4a237d2e7 Mon Sep 17 00:00:00 2001
From: Sina Chavoshi <chavoshi@google.com>
Date: Thu, 13 Feb 2020 18:02:31 -0800
Subject: [PATCH 6/8] updating the diagnose_me component

---
 samples/core/xgboost_training_cm/xgboost_training_cm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index 28c1066370e..1c4ab061a0d 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -22,7 +22,7 @@
 import subprocess
 
 diagnose_me_op = components.load_component_from_url(
-    'https://raw.githubusercontent.com/numerology/pipelines/1611ec7ac09f69fc6d382de8702045ee3fad55ef/components/diagnostics/diagnose_me/component.yaml')
+    'https://raw.githubusercontent.com/kubeflow/pipelines/9b0e9efb382c499cef2750d71cb7b97cbd65cd07/components/diagnostics/diagnose_me/component.yaml')
 
 confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml')
 

From d3510e8585b55d0b13ebae64f7f5817a83c0496a Mon Sep 17 00:00:00 2001
From: sina chavoshi <sina.chavoshi@gmail.com>
Date: Fri, 14 Feb 2020 09:17:11 -0800
Subject: [PATCH 7/8] Removing fail on error for diagnostic step

---
 test/sample-test/configs/xgboost_training_cm.config.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/sample-test/configs/xgboost_training_cm.config.yaml b/test/sample-test/configs/xgboost_training_cm.config.yaml
index 0704a826e36..44e72c80750 100644
--- a/test/sample-test/configs/xgboost_training_cm.config.yaml
+++ b/test/sample-test/configs/xgboost_training_cm.config.yaml
@@ -18,4 +18,5 @@ arguments:
   project: ml-pipeline-test
   rounds: 5
   workers: 2
+  diagnostic_mode: False
 test_timeout: 1800 # xgboost needs extra time.

From ec45a0c2780534463846868f4d887e106da68e72 Mon Sep 17 00:00:00 2001
From: sina chavoshi <sina.chavoshi@gmail.com>
Date: Fri, 14 Feb 2020 09:29:54 -0800
Subject: [PATCH 8/8] Update
 samples/core/xgboost_training_cm/xgboost_training_cm.py

Co-Authored-By: Jiaxiao Zheng <jxzheng@google.com>
---
 samples/core/xgboost_training_cm/xgboost_training_cm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index 1c4ab061a0d..8705c806f5b 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -22,7 +22,7 @@
 import subprocess
 
 diagnose_me_op = components.load_component_from_url(
-    'https://raw.githubusercontent.com/kubeflow/pipelines/9b0e9efb382c499cef2750d71cb7b97cbd65cd07/components/diagnostics/diagnose_me/component.yaml')
+    'https://raw.githubusercontent.com/kubeflow/pipelines/566dddfdfc0a6a725b6e50ea85e73d8d5578bbb9/components/diagnostics/diagnose_me/component.yaml')
 
 confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml')