From 5ab0fe22d8c318b39ac1cf7713214ee6000b8d84 Mon Sep 17 00:00:00 2001 From: arithmetic1728 <58957152+arithmetic1728@users.noreply.github.com> Date: Fri, 7 Aug 2020 17:35:44 -0700 Subject: [PATCH] chore: move samples from python-docs-sample (#66) * Add XMPP Sample * Add Dataproc Sample * Add more region tags * Minor dataproc fixes * Fix Dataproc e2e for Python 3 * Update reqs * updating requirements [(#358)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/358) Change-Id: I6177a17fad021e26ed76679d9db34848c17b62a8 * Update Reqs * Wrong arg description * Auto-update dependencies. [(#456)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/456) * Auto-update dependencies. [(#459)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/459) * Fix import order lint errors Change-Id: Ieaf7237fc6f925daec46a07d2e81a452b841198a * bump Change-Id: I02e7767d13ba267ee9fc72c5b68a57013bb8b8d3 * Auto-update dependencies. [(#486)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/486) * Auto-update dependencies. [(#540)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/540) * Auto-update dependencies. [(#542)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/542) * Move to google-cloud [(#544)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/544) * Auto-update dependencies. [(#584)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/584) * Auto-update dependencies. [(#629)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/629) * Update samples to support latest Google Cloud Python [(#656)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/656) * Update README.md [(#691)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/691) * Auto-update dependencies. [(#715)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/715) * Auto-update dependencies. [(#735)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/735) * Auto-update dependencies. * Fix language OCR sample * Remove unused import * Auto-update dependencies. [(#790)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/790) * Remove usage of GoogleCredentials [(#810)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/810) * Fix a typo [(#813)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/813) * Remove cloud config fixture [(#887)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/887) * Remove cloud config fixture * Fix client secrets * Fix bigtable instance * Fix reference to our testing tools * Auto-update dependencies. [(#914)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/914) * Auto-update dependencies. * xfail the error reporting test * Fix lint * Auto-update dependencies. [(#922)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/922) * Auto-update dependencies. * Fix pubsub iam samples * Auto-update dependencies. [(#1005)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1005) * Auto-update dependencies. * Fix bigtable lint * Fix IOT iam interaction * Auto-update dependencies. [(#1011)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1011) * Properly forwarding the "region" parameter provided as an input argument. [(#1029)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1029) * Auto-update dependencies. [(#1055)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1055) * Auto-update dependencies. * Explicitly use latest bigtable client Change-Id: Id71e9e768f020730e4ca9514a0d7ebaa794e7d9e * Revert language update for now Change-Id: I8867f154e9a5aae00d0047c9caf880e5e8f50c53 * Remove pdb. smh Change-Id: I5ff905fadc026eebbcd45512d4e76e003e3b2b43 * Fix region handling and allow to use an existing cluster. [(#1053)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1053) * Auto-update dependencies. [(#1094)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1094) * Auto-update dependencies. * Relax assertions in the ocr_nl sample Change-Id: I6d37e5846a8d6dd52429cb30d501f448c52cbba1 * Drop unused logging apiary samples Change-Id: I545718283773cb729a5e0def8a76ebfa40829d51 * Auto-update dependencies. [(#1133)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1133) * Auto-update dependencies. * Fix missing http library Change-Id: I99faa600f2f3f1f50f57694fc9835d7f35bda250 * Auto-update dependencies. [(#1186)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1186) * Auto-update dependencies. [(#1199)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1199) * Auto-update dependencies. * Fix iot lint Change-Id: I6289e093bdb35e38f9e9bfc3fbc3df3660f9a67e * Fixed Failed Kokoro Test (Dataproc) [(#1203)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1203) * Fixed Failed Kokoro Test (Dataproc) * Fixed Lint Error * Update dataproc_e2e_test.py * Update dataproc_e2e_test.py * Fixing More Lint Errors * Fixed b/65407087 * Revert "Merge branch 'master' of https://github.com/michaelawyu/python-docs-samples" This reverts commit 1614c7d3ef33630a8ab095792b27fc25fd91f0ad, reversing changes made to cd1dbfd25997a154a8a85cc754cc2a85b18a63c4. * Revert "Fixed b/65407087" This reverts commit cd1dbfd25997a154a8a85cc754cc2a85b18a63c4. * Fixed Lint Error * Fixed Lint Error * Auto-update dependencies. [(#1208)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1208) * Dataproc GCS sample plus doc touchups [(#1151)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1151) * Auto-update dependencies. [(#1217)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1217) * Auto-update dependencies. [(#1239)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1239) * Added "Open in Cloud Shell" buttons to README files [(#1254)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1254) * Auto-update dependencies. [(#1282)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1282) * Auto-update dependencies. * Fix storage acl sample Change-Id: I413bea899fdde4c4859e4070a9da25845b81f7cf * Auto-update dependencies. [(#1309)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1309) * Auto-update dependencies. [(#1320)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1320) * Auto-update dependencies. [(#1355)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1355) * Auto-update dependencies. [(#1359)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1359) * Auto-update dependencies. * update Dataproc region tags to standard format [(#1826)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1826) * Update submit_job_to_cluster.py [(#1708)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1708) switch region to new 'global' region and remove unnecessary function. * Auto-update dependencies. [(#1846)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1846) ACK, merging. * Need separate install for google-cloud-storage [(#1863)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1863) * Revert "Update dataproc/submit_job_to_cluster.py" [(#1864)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1864) * Revert "Remove test configs for non-testing directories [(#1855)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1855)" This reverts commit 73a73321579337312e8ba85c34fe9c37b42b7f6e. * Revert "Auto-update dependencies. [(#1846)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1846)" This reverts commit 3adc94f4d0c14453153968c3851fae100e2c5e44. * Revert "Tweak slack sample [(#1847)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1847)" This reverts commit a48c010481c166968d9f1bd58106054c5d1c58f9. * Revert "Non-client library example of constructing a Signed URL [(#1837)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1837)" This reverts commit fc3284d995a8a35c473a207e80490fad265782af. * Revert "GCF samples: handle {empty JSON, GET} requests + remove commas [(#1832)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1832)" This reverts commit 6928491ed3d52b0bec694e6b30257f08caac5f2b. * Revert "Correct the maintenance event types [(#1830)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1830)" This reverts commit c22840fd23586349b7b665d851dea046a94ba7c7. * Revert "Fix GCF region tags [(#1827)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1827)" This reverts commit 0fbfef27d35cea23ad0e20fd2c9df3e8a4a046cb. * Revert "Updated to Flask 1.0 [(#1819)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1819)" This reverts commit d52ccf99503311bba2cec2881e8cb0f9b5a6f2bf. * Revert "Fix deprecation warning [(#1801)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1801)" This reverts commit 981737e85f60eca5cc337f172249deddca9b291b. * Revert "Update submit_job_to_cluster.py [(#1708)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1708)" This reverts commit df1f2b22547b7ca86bbdb791ad930003a815a677. * Create python-api-walkthrough.md [(#1966)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1966) * Create python-api-walkthrough.md This Google Cloud Shell walkthrough is linked to Cloud Dataproc documentation to be published at: https://cloud.google.com/dataproc/docs/tutorials/python-library-example * Update python-api-walkthrough.md * Update list_clusters.py [(#1887)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1887) * Auto-update dependencies. [(#1980)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/1980) * Auto-update dependencies. * Update requirements.txt * Update requirements.txt * Update Dataproc samples. [(#2158)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2158) * Update requirements.txt * Update python-api-walkthrough.md * Update submit_job_to_cluster.py * Update list_clusters.py * Update python-api-walkthrough.md [(#2172)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2172) * Adds updates including compute [(#2436)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2436) * Adds updates including compute * Python 2 compat pytest * Fixing weird \r\n issue from GH merge * Put asset tests back in * Re-add pod operator test * Hack parameter for k8s pod operator * feat: adding samples for dataproc - create cluster [(#2536)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2536) * adding sample for cluster create * small fix * Add create cluster samples * Fixed copyright, added 'dataproc' to region tag and changed imports from 'dataproc' to 'dataproc_v1' * Fix copyright in create_cluster.py * Auto-update dependencies. [(#2005)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2005) * Auto-update dependencies. * Revert update of appengine/flexible/datastore. * revert update of appengine/flexible/scipy * revert update of bigquery/bqml * revert update of bigquery/cloud-client * revert update of bigquery/datalab-migration * revert update of bigtable/quickstart * revert update of compute/api * revert update of container_registry/container_analysis * revert update of dataflow/run_template * revert update of datastore/cloud-ndb * revert update of dialogflow/cloud-client * revert update of dlp * revert update of functions/imagemagick * revert update of functions/ocr/app * revert update of healthcare/api-client/fhir * revert update of iam/api-client * revert update of iot/api-client/gcs_file_to_device * revert update of iot/api-client/mqtt_example * revert update of language/automl * revert update of run/image-processing * revert update of vision/automl * revert update testing/requirements.txt * revert update of vision/cloud-client/detect * revert update of vision/cloud-client/product_search * revert update of jobs/v2/api_client * revert update of jobs/v3/api_client * revert update of opencensus * revert update of translate/cloud-client * revert update to speech/cloud-client Co-authored-by: Kurtis Van Gent <31518063+kurtisvg@users.noreply.github.com> Co-authored-by: Doug Mahugh * feat: dataproc quickstart sample added and create_cluster updated [(#2629)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2629) * Adding quickstart sample * Added new quickstart sample and updated create_cluster sample * Fix to create_cluster.py * deleted dataproc quickstart files not under dataproc/quickstart/ * Added quickstart test * Linting and formatting fixes * Revert "Linting and formatting fixes" This reverts commit c5afcbcdf9deccbb7a21ddd82ae0fc305e79c008. * Added bucket cleanup to quickstart test * Changes to samples and tests * Linting fixes * Removed todos in favor of clearer docstring * Fixed lint error Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * Update Python Cloud Shell walkthrough script [(#2733)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2733) Cloud Shell walkthrough scripts no longer support enabling APIs. APIs must be enabled by linking to the console. Updated product name: "Cloud Dataproc" -> "Dataproc". * fix: added cli functionality to dataproc quickstart example [(#2734)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2734) * Added CLI functionality to quickstart * Fixed Dataproc quickstart test to properly clean up GCS bucket [(#3001)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3001) * splitting up #2651 part 1/3 - dataproc + endpoints [(#3025)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3025) * splitting up #2651 * fix typos * chore(deps): update dependency google-auth to v1.11.2 [(#2724)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2724) Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * chore(deps): update dependency google-cloud-storage to v1.26.0 [(#3046)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3046) * chore(deps): update dependency google-cloud-storage to v1.26.0 * chore(deps): specify dependencies by python version * chore: up other deps to try to remove errors Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> Co-authored-by: Leah Cole * chore(deps): update dependency google-cloud-dataproc to v0.7.0 [(#3083)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3083) * feat: added dataproc workflows samples [(#3056)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3056) * Added workflows sample * chore(deps): update dependency grpcio to v1.27.2 [(#3173)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3173) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [grpcio](https://grpc.io) | minor | `==1.25.0` -> `==1.27.2` | | [grpcio](https://grpc.io) | minor | `==1.23.0` -> `==1.27.2` | | [grpcio](https://grpc.io) | minor | `==1.26.0` -> `==1.27.2` | | [grpcio](https://grpc.io) | patch | `==1.27.1` -> `==1.27.2` | --- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about these updates again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * Simplify noxfile setup. [(#2806)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2806) * chore(deps): update dependency requests to v2.23.0 * Simplify noxfile and add version control. * Configure appengine/standard to only test Python 2.7. * Update Kokokro configs to match noxfile. * Add requirements-test to each folder. * Remove Py2 versions from everything execept appengine/standard. * Remove conftest.py. * Remove appengine/standard/conftest.py * Remove 'no-sucess-flaky-report' from pytest.ini. * Add GAE SDK back to appengine/standard tests. * Fix typo. * Roll pytest to python 2 version. * Add a bunch of testing requirements. * Remove typo. * Add appengine lib directory back in. * Add some additional requirements. * Fix issue with flake8 args. * Even more requirements. * Readd appengine conftest.py. * Add a few more requirements. * Even more Appengine requirements. * Add webtest for appengine/standard/mailgun. * Add some additional requirements. * Add workaround for issue with mailjet-rest. * Add responses for appengine/standard/mailjet. Co-authored-by: Renovate Bot * fix: add mains to samples [(#3284)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3284) Added mains to two samples: create_cluster and instantiate_inline_workflow_templates. Fixed their associated tests to accommodate this. Removed subprocess from quickstart/quickstart_test.py to fix [2873](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/2873) fixes #2873 * Update dependency grpcio to v1.28.1 [(#3276)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3276) Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * Update dependency google-auth to v1.14.0 [(#3148)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3148) Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> * chore(deps): update dependency google-auth to v1.14.1 [(#3464)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3464) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [google-auth](https://togithub.com/googleapis/google-auth-library-python) | patch | `==1.14.0` -> `==1.14.1` | | [google-auth](https://togithub.com/googleapis/google-auth-library-python) | minor | `==1.11.2` -> `==1.14.1` | --- ### Release Notes
googleapis/google-auth-library-python ### [`v1.14.1`](https://togithub.com/googleapis/google-auth-library-python/blob/master/CHANGELOG.md#​1141-httpswwwgithubcomgoogleapisgoogle-auth-library-pythoncomparev1140v1141-2020-04-21) [Compare Source](https://togithub.com/googleapis/google-auth-library-python/compare/v1.14.0...v1.14.1)
--- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about these updates again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * chore(deps): update dependency google-cloud-storage to v1.28.0 [(#3260)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3260) Co-authored-by: Takashi Matsuo * chore(deps): update dependency google-auth to v1.14.2 [(#3724)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3724) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [google-auth](https://togithub.com/googleapis/google-auth-library-python) | patch | `==1.14.1` -> `==1.14.2` | --- ### Release Notes
googleapis/google-auth-library-python ### [`v1.14.2`](https://togithub.com/googleapis/google-auth-library-python/blob/master/CHANGELOG.md#​1142-httpswwwgithubcomgoogleapisgoogle-auth-library-pythoncomparev1141v1142-2020-05-07) [Compare Source](https://togithub.com/googleapis/google-auth-library-python/compare/v1.14.1...v1.14.2)
--- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * chore: some lint fixes [(#3743)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3743) * chore(deps): update dependency google-auth to v1.14.3 [(#3728)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3728) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [google-auth](https://togithub.com/googleapis/google-auth-library-python) | patch | `==1.14.2` -> `==1.14.3` | --- ### Release Notes
googleapis/google-auth-library-python ### [`v1.14.3`](https://togithub.com/googleapis/google-auth-library-python/blob/master/CHANGELOG.md#​1143-httpswwwgithubcomgoogleapisgoogle-auth-library-pythoncomparev1142v1143-2020-05-11) [Compare Source](https://togithub.com/googleapis/google-auth-library-python/compare/v1.14.2...v1.14.3)
--- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about this update again. --- - [x] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * chore(deps): update dependency grpcio to v1.29.0 [(#3786)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3786) * chore(deps): update dependency google-cloud-storage to v1.28.1 [(#3785)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3785) * chore(deps): update dependency google-cloud-storage to v1.28.1 * [asset] testing: use uuid instead of time Co-authored-by: Takashi Matsuo * update google-auth to 1.15.0 part 3 [(#3816)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3816) * Update dependency google-cloud-dataproc to v0.8.0 [(#3837)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3837) * chore(deps): update dependency google-auth to v1.16.0 [(#3903)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3903) * update google-auth part 3 [(#3963)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/3963) * chore(deps): update dependency google-cloud-dataproc to v0.8.1 [(#4015)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4015) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [google-cloud-dataproc](https://togithub.com/googleapis/python-dataproc) | patch | `==0.8.0` -> `==0.8.1` | --- ### Release Notes
googleapis/python-dataproc ### [`v0.8.1`](https://togithub.com/googleapis/python-dataproc/blob/master/CHANGELOG.md#​081-httpswwwgithubcomgoogleapispython-dataproccomparev080v081-2020-06-05) [Compare Source](https://togithub.com/googleapis/python-dataproc/compare/v0.8.0...v0.8.1)
--- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * Replace GCLOUD_PROJECT with GOOGLE_CLOUD_PROJECT. [(#4022)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4022) * Update dependency google-auth to v1.17.0 [(#4058)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4058) * chore(deps): update dependency google-auth to v1.17.1 [(#4073)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4073) * Update dependency google-auth to v1.17.2 [(#4083)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4083) * Update dependency google-auth to v1.18.0 [(#4125)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4125) * Update dependency google-cloud-dataproc to v1 [(#4109)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4109) Co-authored-by: Takashi Matsuo * chore(deps): update dependency google-cloud-storage to v1.29.0 [(#4040)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4040) * chore(deps): update dependency grpcio to v1.30.0 [(#4143)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4143) Co-authored-by: Takashi Matsuo * Update dependency google-auth-httplib2 to v0.0.4 [(#4255)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4255) Co-authored-by: Takashi Matsuo * chore(deps): update dependency pytest to v5.4.3 [(#4279)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4279) * chore(deps): update dependency pytest to v5.4.3 * specify pytest for python 2 in appengine Co-authored-by: Leah Cole * chore(deps): update dependency google-auth to v1.19.0 [(#4293)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4293) * chore(deps): update dependency google-cloud-dataproc to v1.0.1 [(#4309)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4309) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [google-cloud-dataproc](https://togithub.com/googleapis/python-dataproc) | patch | `==1.0.0` -> `==1.0.1` | --- ### Release Notes
googleapis/python-dataproc ### [`v1.0.1`](https://togithub.com/googleapis/python-dataproc/blob/master/CHANGELOG.md#​101-httpswwwgithubcomgoogleapispython-dataproccomparev100v101-2020-07-16) [Compare Source](https://togithub.com/googleapis/python-dataproc/compare/v1.0.0...v1.0.1)
--- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * chore(deps): update dependency google-auth to v1.19.1 [(#4304)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4304) * chore(deps): update dependency google-auth to v1.19.2 [(#4321)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4321) This PR contains the following updates: | Package | Update | Change | |---|---|---| | [google-auth](https://togithub.com/googleapis/google-auth-library-python) | patch | `==1.19.1` -> `==1.19.2` | --- ### Release Notes
googleapis/google-auth-library-python ### [`v1.19.2`](https://togithub.com/googleapis/google-auth-library-python/blob/master/CHANGELOG.md#​1192-httpswwwgithubcomgoogleapisgoogle-auth-library-pythoncomparev1191v1192-2020-07-17) [Compare Source](https://togithub.com/googleapis/google-auth-library-python/compare/v1.19.1...v1.19.2)
--- ### Renovate configuration :date: **Schedule**: At any time (no schedule defined). :vertical_traffic_light: **Automerge**: Disabled by config. Please merge this manually once you are satisfied. :recycle: **Rebasing**: Never, or you tick the rebase/retry checkbox. :no_bell: **Ignore**: Close this PR and you won't be reminded about this update again. --- - [ ] If you want to rebase/retry this PR, check this box --- This PR has been generated by [WhiteSource Renovate](https://renovate.whitesourcesoftware.com). View repository job log [here](https://app.renovatebot.com/dashboard#GoogleCloudPlatform/python-docs-samples). * Update dependency google-auth to v1.20.0 [(#4387)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4387) * Update dependency pytest to v6 [(#4390)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4390) * Update dependency grpcio to v1.31.0 [(#4438)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4438) * chore(deps): update dependency google-auth to v1.20.1 [(#4452)](https://github.com/GoogleCloudPlatform/python-docs-samples/issues/4452) * chore: update templates Co-authored-by: Bill Prin Co-authored-by: Bill Prin Co-authored-by: Jon Wayne Parrott Co-authored-by: Eran Kampf Co-authored-by: DPE bot Co-authored-by: aman-ebay Co-authored-by: Martial Hue Co-authored-by: Gioia Ballin Co-authored-by: michaelawyu Co-authored-by: michaelawyu Co-authored-by: Alix Hamilton Co-authored-by: James Winegar Co-authored-by: Charles Engelke Co-authored-by: Gus Class Co-authored-by: Brad Miro Co-authored-by: Kurtis Van Gent <31518063+kurtisvg@users.noreply.github.com> Co-authored-by: Doug Mahugh Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com> Co-authored-by: WhiteSource Renovate Co-authored-by: Leah Cole Co-authored-by: Takashi Matsuo --- dataproc/snippets/README.md | 84 +++++ dataproc/snippets/create_cluster.py | 77 +++++ dataproc/snippets/create_cluster_test.py | 47 +++ dataproc/snippets/dataproc_e2e_donttest.py | 32 ++ .../instantiate_inline_workflow_template.py | 107 +++++++ ...stantiate_inline_workflow_template_test.py | 31 ++ dataproc/snippets/list_clusters.py | 63 ++++ dataproc/snippets/noxfile.py | 224 ++++++++++++++ dataproc/snippets/pyspark_sort.py | 28 ++ dataproc/snippets/pyspark_sort_gcs.py | 30 ++ dataproc/snippets/python-api-walkthrough.md | 170 +++++++++++ dataproc/snippets/quickstart/quickstart.py | 159 ++++++++++ .../snippets/quickstart/quickstart_test.py | 71 +++++ dataproc/snippets/requirements-test.txt | 1 + dataproc/snippets/requirements.txt | 6 + dataproc/snippets/single_job_workflow.py | 209 +++++++++++++ dataproc/snippets/submit_job_to_cluster.py | 288 ++++++++++++++++++ 17 files changed, 1627 insertions(+) create mode 100644 dataproc/snippets/README.md create mode 100644 dataproc/snippets/create_cluster.py create mode 100644 dataproc/snippets/create_cluster_test.py create mode 100644 dataproc/snippets/dataproc_e2e_donttest.py create mode 100644 dataproc/snippets/instantiate_inline_workflow_template.py create mode 100644 dataproc/snippets/instantiate_inline_workflow_template_test.py create mode 100644 dataproc/snippets/list_clusters.py create mode 100644 dataproc/snippets/noxfile.py create mode 100644 dataproc/snippets/pyspark_sort.py create mode 100644 dataproc/snippets/pyspark_sort_gcs.py create mode 100644 dataproc/snippets/python-api-walkthrough.md create mode 100644 dataproc/snippets/quickstart/quickstart.py create mode 100644 dataproc/snippets/quickstart/quickstart_test.py create mode 100644 dataproc/snippets/requirements-test.txt create mode 100644 dataproc/snippets/requirements.txt create mode 100644 dataproc/snippets/single_job_workflow.py create mode 100644 dataproc/snippets/submit_job_to_cluster.py diff --git a/dataproc/snippets/README.md b/dataproc/snippets/README.md new file mode 100644 index 000000000000..98622be7dc16 --- /dev/null +++ b/dataproc/snippets/README.md @@ -0,0 +1,84 @@ +# Cloud Dataproc API Examples + +[![Open in Cloud Shell][shell_img]][shell_link] + +[shell_img]: http://gstatic.com/cloudssh/images/open-btn.png +[shell_link]: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataproc/README.md + +Sample command-line programs for interacting with the Cloud Dataproc API. + +See [the tutorial on the using the Dataproc API with the Python client +library](https://cloud.google.com/dataproc/docs/tutorials/python-library-example) +for information on a walkthrough you can run to try out the Cloud Dataproc API sample code. + +Note that while this sample demonstrates interacting with Dataproc via the API, the functionality demonstrated here could also be accomplished using the Cloud Console or the gcloud CLI. + +`list_clusters.py` is a simple command-line program to demonstrate connecting to the Cloud Dataproc API and listing the clusters in a region. + +`submit_job_to_cluster.py` demonstrates how to create a cluster, submit the +`pyspark_sort.py` job, download the output from Google Cloud Storage, and output the result. + +`single_job_workflow.py` uses the Cloud Dataproc InstantiateInlineWorkflowTemplate API to create an ephemeral cluster, run a job, then delete the cluster with one API request. + +`pyspark_sort.py_gcs` is the same as `pyspark_sort.py` but demonstrates + reading from a GCS bucket. + +## Prerequisites to run locally: + +* [pip](https://pypi.python.org/pypi/pip) + +Go to the [Google Cloud Console](https://console.cloud.google.com). + +Under API Manager, search for the Google Cloud Dataproc API and enable it. + +## Set Up Your Local Dev Environment + +To install, run the following commands. If you want to use [virtualenv](https://virtualenv.readthedocs.org/en/latest/) +(recommended), run the commands within a virtualenv. + + * pip install -r requirements.txt + +## Authentication + +Please see the [Google cloud authentication guide](https://cloud.google.com/docs/authentication/). +The recommended approach to running these samples is a Service Account with a JSON key. + +## Environment Variables + +Set the following environment variables: + + GOOGLE_CLOUD_PROJECT=your-project-id + REGION=us-central1 # or your region + CLUSTER_NAME=waprin-spark7 + ZONE=us-central1-b + +## Running the samples + +To run list_clusters.py: + + python list_clusters.py $GOOGLE_CLOUD_PROJECT --region=$REGION + +`submit_job_to_cluster.py` can create the Dataproc cluster or use an existing cluster. To create a cluster before running the code, you can use the [Cloud Console](console.cloud.google.com) or run: + + gcloud dataproc clusters create your-cluster-name + +To run submit_job_to_cluster.py, first create a GCS bucket (used by Cloud Dataproc to stage files) from the Cloud Console or with gsutil: + + gsutil mb gs:// + +Next, set the following environment variables: + + BUCKET=your-staging-bucket + CLUSTER=your-cluster-name + +Then, if you want to use an existing cluster, run: + + python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET + +Alternatively, to create a new cluster, which will be deleted at the end of the job, run: + + python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET --create_new_cluster + +The script will setup a cluster, upload the PySpark file, submit the job, print the result, then, if it created the cluster, delete the cluster. + +Optionally, you can add the `--pyspark_file` argument to change from the default `pyspark_sort.py` included in this script to a new script. diff --git a/dataproc/snippets/create_cluster.py b/dataproc/snippets/create_cluster.py new file mode 100644 index 000000000000..b4d63d2e13f5 --- /dev/null +++ b/dataproc/snippets/create_cluster.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This sample walks a user through creating a Cloud Dataproc cluster using +# the Python client library. +# +# This script can be run on its own: +# python create_cluster.py ${PROJECT_ID} ${REGION} ${CLUSTER_NAME} + + +import sys + +# [START dataproc_create_cluster] +from google.cloud import dataproc_v1 as dataproc + + +def create_cluster(project_id, region, cluster_name): + """This sample walks a user through creating a Cloud Dataproc cluster + using the Python client library. + + Args: + project_id (string): Project to use for creating resources. + region (string): Region where the resources should live. + cluster_name (string): Name to use for creating a cluster. + """ + + # Create a client with the endpoint set to the desired cluster region. + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{region}-dataproc.googleapis.com:443', + }) + + # Create the cluster config. + cluster = { + 'project_id': project_id, + 'cluster_name': cluster_name, + 'config': { + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-1' + }, + 'worker_config': { + 'num_instances': 2, + 'machine_type_uri': 'n1-standard-1' + } + } + } + + # Create the cluster. + operation = cluster_client.create_cluster(project_id, region, cluster) + result = operation.result() + + # Output a success message. + print(f'Cluster created successfully: {result.cluster_name}') + # [END dataproc_create_cluster] + + +if __name__ == "__main__": + if len(sys.argv) < 4: + sys.exit('python create_cluster.py project_id region cluster_name') + + project_id = sys.argv[1] + region = sys.argv[2] + cluster_name = sys.argv[3] + create_cluster(project_id, region, cluster_name) diff --git a/dataproc/snippets/create_cluster_test.py b/dataproc/snippets/create_cluster_test.py new file mode 100644 index 000000000000..6b1d6806100e --- /dev/null +++ b/dataproc/snippets/create_cluster_test.py @@ -0,0 +1,47 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +from google.cloud import dataproc_v1 as dataproc +import pytest + +import create_cluster + + +PROJECT_ID = os.environ['GOOGLE_CLOUD_PROJECT'] +REGION = 'us-central1' +CLUSTER_NAME = 'py-cc-test-{}'.format(str(uuid.uuid4())) + + +@pytest.fixture(autouse=True) +def teardown(): + yield + + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': f'{REGION}-dataproc.googleapis.com:443' + }) + # Client library function + operation = cluster_client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME) + # Wait for cluster to delete + operation.result() + + +def test_cluster_create(capsys): + # Wrapper function for client library function + create_cluster.create_cluster(PROJECT_ID, REGION, CLUSTER_NAME) + + out, _ = capsys.readouterr() + assert CLUSTER_NAME in out diff --git a/dataproc/snippets/dataproc_e2e_donttest.py b/dataproc/snippets/dataproc_e2e_donttest.py new file mode 100644 index 000000000000..44cc03bfd428 --- /dev/null +++ b/dataproc/snippets/dataproc_e2e_donttest.py @@ -0,0 +1,32 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Integration tests for Dataproc samples. + +Creates a Dataproc cluster, uploads a pyspark file to Google Cloud Storage, +submits a job to Dataproc that runs the pyspark file, then downloads +the output logs from Cloud Storage and verifies the expected output.""" + +import os + +import submit_job_to_cluster + +PROJECT = os.environ['GOOGLE_CLOUD_PROJECT'] +BUCKET = os.environ['CLOUD_STORAGE_BUCKET'] +CLUSTER_NAME = 'testcluster3' +ZONE = 'us-central1-b' + + +def test_e2e(): + output = submit_job_to_cluster.main( + PROJECT, ZONE, CLUSTER_NAME, BUCKET) + assert b"['Hello,', 'dog', 'elephant', 'panther', 'world!']" in output diff --git a/dataproc/snippets/instantiate_inline_workflow_template.py b/dataproc/snippets/instantiate_inline_workflow_template.py new file mode 100644 index 000000000000..f9358376f9f9 --- /dev/null +++ b/dataproc/snippets/instantiate_inline_workflow_template.py @@ -0,0 +1,107 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This sample walks a user through instantiating an inline +# workflow for Cloud Dataproc using the Python client library. +# +# This script can be run on its own: +# python instantiate_inline_workflow_template.py ${PROJECT_ID} ${REGION} + + +import sys + +# [START dataproc_instantiate_inline_workflow_template] +from google.cloud import dataproc_v1 as dataproc + + +def instantiate_inline_workflow_template(project_id, region): + """This sample walks a user through submitting a workflow + for a Cloud Dataproc using the Python client library. + + Args: + project_id (string): Project to use for running the workflow. + region (string): Region where the workflow resources should live. + """ + + # Create a client with the endpoint set to the desired region. + workflow_template_client = dataproc.WorkflowTemplateServiceClient( + client_options={ + 'api_endpoint': f'{region}-dataproc.googleapis.com:443' + } + ) + + parent = workflow_template_client.region_path(project_id, region) + + template = { + 'jobs': [ + { + 'hadoop_job': { + 'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/' + 'hadoop-mapreduce-examples.jar', + 'args': [ + 'teragen', + '1000', + 'hdfs:///gen/' + ] + }, + 'step_id': 'teragen' + }, + { + 'hadoop_job': { + 'main_jar_file_uri': 'file:///usr/lib/hadoop-mapreduce/' + 'hadoop-mapreduce-examples.jar', + 'args': [ + 'terasort', + 'hdfs:///gen/', + 'hdfs:///sort/' + ] + }, + 'step_id': 'terasort', + 'prerequisite_step_ids': [ + 'teragen' + ] + }], + 'placement': { + 'managed_cluster': { + 'cluster_name': 'my-managed-cluster', + 'config': { + 'gce_cluster_config': { + # Leave 'zone_uri' empty for 'Auto Zone Placement' + # 'zone_uri': '' + 'zone_uri': 'us-central1-a' + } + } + } + } + } + + # Submit the request to instantiate the workflow from an inline template. + operation = workflow_template_client.instantiate_inline_workflow_template( + parent, template + ) + operation.result() + + # Output a success message. + print('Workflow ran successfully.') + # [END dataproc_instantiate_inline_workflow_template] + + +if __name__ == "__main__": + if len(sys.argv) < 3: + sys.exit('python instantiate_inline_workflow_template.py ' + + 'project_id region') + + project_id = sys.argv[1] + region = sys.argv[2] + instantiate_inline_workflow_template(project_id, region) diff --git a/dataproc/snippets/instantiate_inline_workflow_template_test.py b/dataproc/snippets/instantiate_inline_workflow_template_test.py new file mode 100644 index 000000000000..22673e4ee086 --- /dev/null +++ b/dataproc/snippets/instantiate_inline_workflow_template_test.py @@ -0,0 +1,31 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import instantiate_inline_workflow_template + + +PROJECT_ID = os.environ['GOOGLE_CLOUD_PROJECT'] +REGION = 'us-central1' + + +def test_workflows(capsys): + # Wrapper function for client library function + instantiate_inline_workflow_template.instantiate_inline_workflow_template( + PROJECT_ID, REGION + ) + + out, _ = capsys.readouterr() + assert "successfully" in out diff --git a/dataproc/snippets/list_clusters.py b/dataproc/snippets/list_clusters.py new file mode 100644 index 000000000000..1639c4134685 --- /dev/null +++ b/dataproc/snippets/list_clusters.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Sample command-line program to list Cloud Dataproc clusters in a region. + +Example usage: +python list_clusters.py --project_id=my-project-id --region=global + +""" +import argparse + +from google.cloud import dataproc_v1 +from google.cloud.dataproc_v1.gapic.transports import ( + cluster_controller_grpc_transport) + + +# [START dataproc_list_clusters] +def list_clusters(dataproc, project, region): + """List the details of clusters in the region.""" + for cluster in dataproc.list_clusters(project, region): + print(('{} - {}'.format(cluster.cluster_name, + cluster.status.State.Name( + cluster.status.state)))) +# [END dataproc_list_clusters] + + +def main(project_id, region): + + if region == 'global': + # Use the default gRPC global endpoints. + dataproc_cluster_client = dataproc_v1.ClusterControllerClient() + else: + # Use a regional gRPC endpoint. See: + # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints + client_transport = ( + cluster_controller_grpc_transport.ClusterControllerGrpcTransport( + address='{}-dataproc.googleapis.com:443'.format(region))) + dataproc_cluster_client = dataproc_v1.ClusterControllerClient( + client_transport) + + list_clusters(dataproc_cluster_client, project_id, region) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=( + argparse.RawDescriptionHelpFormatter)) + parser.add_argument( + '--project_id', help='Project ID to access.', required=True) + parser.add_argument( + '--region', help='Region of clusters to list.', required=True) + + args = parser.parse_args() + main(args.project_id, args.region) diff --git a/dataproc/snippets/noxfile.py b/dataproc/snippets/noxfile.py new file mode 100644 index 000000000000..ba55d7ce53ca --- /dev/null +++ b/dataproc/snippets/noxfile.py @@ -0,0 +1,224 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to tested samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local". + + This is used when running the linter to insure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__") + ] + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ‘:’ +# E266: too many leading ‘#’ for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items. + p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/dataproc/snippets/pyspark_sort.py b/dataproc/snippets/pyspark_sort.py new file mode 100644 index 000000000000..0ce2350ad02b --- /dev/null +++ b/dataproc/snippets/pyspark_sort.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Sample pyspark script to be uploaded to Cloud Storage and run on +Cloud Dataproc. + +Note this file is not intended to be run directly, but run inside a PySpark +environment. +""" + +# [START dataproc_pyspark_sort] +import pyspark + +sc = pyspark.SparkContext() +rdd = sc.parallelize(['Hello,', 'world!', 'dog', 'elephant', 'panther']) +words = sorted(rdd.collect()) +print(words) +# [END dataproc_pyspark_sort] diff --git a/dataproc/snippets/pyspark_sort_gcs.py b/dataproc/snippets/pyspark_sort_gcs.py new file mode 100644 index 000000000000..f1961c378d36 --- /dev/null +++ b/dataproc/snippets/pyspark_sort_gcs.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Sample pyspark script to be uploaded to Cloud Storage and run on +Cloud Dataproc. + +Note this file is not intended to be run directly, but run inside a PySpark +environment. + +This file demonstrates how to read from a GCS bucket. See README.md for more +information. +""" + +# [START dataproc_pyspark_sort_gcs] +import pyspark + +sc = pyspark.SparkContext() +rdd = sc.textFile('gs://path-to-your-GCS-file') +print(sorted(rdd.collect())) +# [END dataproc_pyspark_sort_gcs] diff --git a/dataproc/snippets/python-api-walkthrough.md b/dataproc/snippets/python-api-walkthrough.md new file mode 100644 index 000000000000..1a8d436f7202 --- /dev/null +++ b/dataproc/snippets/python-api-walkthrough.md @@ -0,0 +1,170 @@ +# Use the Python Client Library to call Dataproc APIs + +Estimated completion time: + +## Overview + +This [Cloud Shell](https://cloud.google.com/shell/docs/) walkthrough leads you +through the steps to use the +[Google Cloud Client Libraries for Python](https://googleapis.github.io/google-cloud-python/latest/dataproc/index.html) +to programmatically interact with [Dataproc](https://cloud.google.com/dataproc/docs/). + +As you follow this walkthrough, you run Python code that calls +[Dataproc gRPC APIs](https://cloud.google.com/dataproc/docs/reference/rpc/) +to: + +* create a Dataproc cluster +* submit a small PySpark word sort job to run on the cluster +* get job status +* tear down the cluster after job completion + +## Using the walkthrough + +The `submit_job_to_cluster.py file` used in this walkthrough is opened in the +Cloud Shell editor when you launch the walkthrough. You can view +the code as your follow the walkthrough steps. + +**For more information**: See [Dataproc→Use the Python Client Library](https://cloud.google.com/dataproc/docs/tutorials/python-library-example) for +an explanation of how the code works. + +**To reload this walkthrough:** Run the following command from the +`~/python-docs-samples/dataproc` directory in Cloud Shell: + + cloudshell launch-tutorial python-api-walkthrough.md + +**To copy and run commands**: Click the "Paste in Cloud Shell" button + () + on the side of a code box, then press `Enter` to run the command. + +## Prerequisites (1) + +1. Create or select a Google Cloud Platform project to use for this tutorial. + * + +1. Click the link below to enable the Dataproc, Compute Engine, and Cloud Storage APIs + in a separate GCP console tab in your browser. + + **Note:** After you select your project and enable the APIs, return to this tutorial by clicking + on the **Cloud Shell** tab in your browser. + + * [Enable APIs](https://console.cloud.google.com/flows/enableapi?apiid=dataproc,compute_component,storage-component.googleapis.com&redirect=https://console.cloud.google.com) + +## Prerequisites (2) + +1. This walkthrough uploads a PySpark file (`pyspark_sort.py`) to a + [Cloud Storage bucket](https://cloud.google.com/storage/docs/key-terms#buckets) in + your project. + * You can use the [Cloud Storage browser page](https://console.cloud.google.com/storage/browser) + in Google Cloud Platform Console to view existing buckets in your project. + +     **OR** + + * To create a new bucket, run the following command. Your bucket name must be unique. + ```bash + gsutil mb -p {{project-id}} gs://your-bucket-name + ``` + +1. Set environment variables. + + * Set the name of your bucket. + ```bash + BUCKET=your-bucket-name + ``` + +## Prerequisites (3) + +1. Set up a Python + [virtual environment](https://virtualenv.readthedocs.org/en/latest/) + in Cloud Shell. + + * Create the virtual environment. + ```bash + virtualenv ENV + ``` + * Activate the virtual environment. + ```bash + source ENV/bin/activate + ``` + +1. Install library dependencies in Cloud Shell. + ```bash + pip install -r requirements.txt + ``` + +## Create a cluster and submit a job + +1. Set a name for your new cluster. + ```bash + CLUSTER=new-cluster-name + ``` + +1. Set a [zone](https://cloud.google.com/compute/docs/regions-zones/#available) + where your new cluster will be located. You can change the + "us-central1-a" zone that is pre-set in the following command. + ```bash + ZONE=us-central1-a + ``` + +1. Run `submit_job.py` with the `--create_new_cluster` flag + to create a new cluster and submit the `pyspark_sort.py` job + to the cluster. + + ```bash + python submit_job_to_cluster.py \ + --project_id={{project-id}} \ + --cluster_name=$CLUSTER \ + --zone=$ZONE \ + --gcs_bucket=$BUCKET \ + --create_new_cluster + ``` + +## Job Output + +Job output in Cloud Shell shows cluster creation, job submission, + job completion, and then tear-down of the cluster. + + ... + Creating cluster... + Cluster created. + Uploading pyspark file to Cloud Storage. + new-cluster-name - RUNNING + Submitted job ID ... + Waiting for job to finish... + Job finished. + Downloading output file + ..... + ['Hello,', 'dog', 'elephant', 'panther', 'world!'] + ... + Tearing down cluster + ``` +## Congratulations on Completing the Walkthrough! + + +--- + +### Next Steps: + +* **View job details from the Console.** View job details by selecting the + PySpark job from the Dataproc += + [Jobs page](https://console.cloud.google.com/dataproc/jobs) + in the Google Cloud Platform Console. + +* **Delete resources used in the walkthrough.** + The `submit_job_to_cluster.py` job deletes the cluster that it created for this + walkthrough. + + If you created a bucket to use for this walkthrough, + you can run the following command to delete the + Cloud Storage bucket (the bucket must be empty). + ```bash + gsutil rb gs://$BUCKET + ``` + You can run the following command to delete the bucket **and all + objects within it. Note: the deleted objects cannot be recovered.** + ```bash + gsutil rm -r gs://$BUCKET + ``` + +* **For more information.** See the [Dataproc documentation](https://cloud.google.com/dataproc/docs/) + for API reference and product feature information. diff --git a/dataproc/snippets/quickstart/quickstart.py b/dataproc/snippets/quickstart/quickstart.py new file mode 100644 index 000000000000..4159e2815202 --- /dev/null +++ b/dataproc/snippets/quickstart/quickstart.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python + +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# [START dataproc_quickstart] +""" +This quickstart sample walks a user through creating a Cloud Dataproc +cluster, submitting a PySpark job from Google Cloud Storage to the +cluster, reading the output of the job and deleting the cluster, all +using the Python client library. + +Usage: + python quickstart.py --project_id --region \ + --cluster_name --job_file_path +""" + +import argparse +import time + +from google.cloud import dataproc_v1 as dataproc +from google.cloud import storage + + +def quickstart(project_id, region, cluster_name, job_file_path): + # Create the cluster client. + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + # Create the cluster config. + cluster = { + 'project_id': project_id, + 'cluster_name': cluster_name, + 'config': { + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-1' + }, + 'worker_config': { + 'num_instances': 2, + 'machine_type_uri': 'n1-standard-1' + } + } + } + + # Create the cluster. + operation = cluster_client.create_cluster(project_id, region, cluster) + result = operation.result() + + print('Cluster created successfully: {}'.format(result.cluster_name)) + + # Create the job client. + job_client = dataproc.JobControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(region) + }) + + # Create the job config. + job = { + 'placement': { + 'cluster_name': cluster_name + }, + 'pyspark_job': { + 'main_python_file_uri': job_file_path + } + } + + job_response = job_client.submit_job(project_id, region, job) + job_id = job_response.reference.job_id + + print('Submitted job \"{}\".'.format(job_id)) + + # Termimal states for a job. + terminal_states = { + dataproc.types.JobStatus.ERROR, + dataproc.types.JobStatus.CANCELLED, + dataproc.types.JobStatus.DONE + } + + # Create a timeout such that the job gets cancelled if not in a + # terminal state after a fixed period of time. + timeout_seconds = 600 + time_start = time.time() + + # Wait for the job to complete. + while job_response.status.state not in terminal_states: + if time.time() > time_start + timeout_seconds: + job_client.cancel_job(project_id, region, job_id) + print('Job {} timed out after threshold of {} seconds.'.format( + job_id, timeout_seconds)) + + # Poll for job termination once a second. + time.sleep(1) + job_response = job_client.get_job(project_id, region, job_id) + + # Cloud Dataproc job output gets saved to a GCS bucket allocated to it. + cluster_info = cluster_client.get_cluster( + project_id, region, cluster_name) + + storage_client = storage.Client() + bucket = storage_client.get_bucket(cluster_info.config.config_bucket) + output_blob = ( + 'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000' + .format(cluster_info.cluster_uuid, job_id)) + output = bucket.blob(output_blob).download_as_string() + + print('Job {} finished with state {}:\n{}'.format( + job_id, + job_response.status.State.Name(job_response.status.state), + output)) + + # Delete the cluster once the job has terminated. + operation = cluster_client.delete_cluster(project_id, region, cluster_name) + operation.result() + + print('Cluster {} successfully deleted.'.format(cluster_name)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + '--project_id', + type=str, + required=True, + help='Project to use for creating resources.') + parser.add_argument( + '--region', + type=str, + required=True, + help='Region where the resources should live.') + parser.add_argument( + '--cluster_name', + type=str, + required=True, + help='Name to use for creating a cluster.') + parser.add_argument( + '--job_file_path', + type=str, + required=True, + help='Job in GCS to execute against the cluster.') + + args = parser.parse_args() + quickstart(args.project_id, args.region, + args.cluster_name, args.job_file_path) +# [END dataproc_quickstart] diff --git a/dataproc/snippets/quickstart/quickstart_test.py b/dataproc/snippets/quickstart/quickstart_test.py new file mode 100644 index 000000000000..3e17f6fa3e57 --- /dev/null +++ b/dataproc/snippets/quickstart/quickstart_test.py @@ -0,0 +1,71 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +from google.cloud import dataproc_v1 as dataproc +from google.cloud import storage +import pytest + +import quickstart + + +PROJECT_ID = os.environ['GOOGLE_CLOUD_PROJECT'] +REGION = 'us-central1' +CLUSTER_NAME = 'py-qs-test-{}'.format(str(uuid.uuid4())) +STAGING_BUCKET = 'py-dataproc-qs-bucket-{}'.format(str(uuid.uuid4())) +JOB_FILE_NAME = 'sum.py' +JOB_FILE_PATH = 'gs://{}/{}'.format(STAGING_BUCKET, JOB_FILE_NAME) +SORT_CODE = ( + "import pyspark\n" + "sc = pyspark.SparkContext()\n" + "rdd = sc.parallelize((1,2,3,4,5))\n" + "sum = rdd.reduce(lambda x, y: x + y)\n" +) + + +@pytest.fixture(autouse=True) +def setup_teardown(): + storage_client = storage.Client() + bucket = storage_client.create_bucket(STAGING_BUCKET) + blob = bucket.blob(JOB_FILE_NAME) + blob.upload_from_string(SORT_CODE) + + yield + + cluster_client = dataproc.ClusterControllerClient(client_options={ + 'api_endpoint': '{}-dataproc.googleapis.com:443'.format(REGION) + }) + + # The quickstart sample deletes the cluster, but if the test fails + # before cluster deletion occurs, it can be manually deleted here. + clusters = cluster_client.list_clusters(PROJECT_ID, REGION) + + for cluster in clusters: + if cluster.cluster_name == CLUSTER_NAME: + cluster_client.delete_cluster(PROJECT_ID, REGION, CLUSTER_NAME) + + blob.delete() + bucket.delete() + + +def test_quickstart(capsys): + quickstart.quickstart(PROJECT_ID, REGION, CLUSTER_NAME, JOB_FILE_PATH) + out, _ = capsys.readouterr() + + assert 'Cluster created successfully' in out + assert 'Submitted job' in out + assert 'finished with state DONE:' in out + assert 'successfully deleted' in out diff --git a/dataproc/snippets/requirements-test.txt b/dataproc/snippets/requirements-test.txt new file mode 100644 index 000000000000..7e460c8c866e --- /dev/null +++ b/dataproc/snippets/requirements-test.txt @@ -0,0 +1 @@ +pytest==6.0.1 diff --git a/dataproc/snippets/requirements.txt b/dataproc/snippets/requirements.txt new file mode 100644 index 000000000000..ebc1d8813435 --- /dev/null +++ b/dataproc/snippets/requirements.txt @@ -0,0 +1,6 @@ +grpcio==1.31.0 +google-auth==1.20.1 +google-auth-httplib2==0.0.4 +google-cloud==0.34.0 +google-cloud-storage==1.29.0 +google-cloud-dataproc==1.0.1 diff --git a/dataproc/snippets/single_job_workflow.py b/dataproc/snippets/single_job_workflow.py new file mode 100644 index 000000000000..b2754b06c1ec --- /dev/null +++ b/dataproc/snippets/single_job_workflow.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +r"""Sample Cloud Dataproc inline workflow to run a pyspark job on an ephermeral +cluster. +Example Usage to run the inline workflow on a managed cluster: +python single_job_workflow.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ + --cluster_name=$CLUSTER --zone=$ZONE +Example Usage to run the inline workflow on a global region managed cluster: +python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ + --cluster_name=$CLUSTER --zone=$ZONE --global_region +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +from google.cloud import dataproc_v1 +from google.cloud import storage +from google.cloud.dataproc_v1.gapic.transports import ( + workflow_template_service_grpc_transport) + + +DEFAULT_FILENAME = "pyspark_sort.py" +waiting_callback = False + + +def get_pyspark_file(pyspark_file=None): + if pyspark_file: + f = open(pyspark_file, "rb") + return f, os.path.basename(pyspark_file) + else: + """Gets the PySpark file from current directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(current_dir, DEFAULT_FILENAME), "rb") + return f, DEFAULT_FILENAME + + +def get_region_from_zone(zone): + try: + region_as_list = zone.split("-")[:-1] + return "-".join(region_as_list) + except (AttributeError, IndexError, ValueError): + raise ValueError("Invalid zone provided, please check your input.") + + +def upload_pyspark_file(project, bucket_name, filename, spark_file): + """Uploads the PySpark file in this directory to the configured input + bucket.""" + print("Uploading pyspark file to Cloud Storage.") + client = storage.Client(project=project) + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(filename) + blob.upload_from_file(spark_file) + + +def run_workflow(dataproc, project, region, zone, bucket_name, filename, + cluster_name): + + parent = "projects/{}/regions/{}".format(project, region) + zone_uri = ("https://www.googleapis.com/compute/v1/projects/{}/zones/{}" + .format(project, zone)) + + workflow_data = { + "placement": { + "managed_cluster": { + "cluster_name": cluster_name, + "config": { + "gce_cluster_config": {"zone_uri": zone_uri}, + "master_config": { + "num_instances": 1, + "machine_type_uri": "n1-standard-1", + }, + "worker_config": { + "num_instances": 2, + "machine_type_uri": "n1-standard-1", + }, + }, + } + }, + "jobs": [ + { + "pyspark_job": { + "main_python_file_uri": "gs://{}/{}".format( + bucket_name, filename) + }, + "step_id": "pyspark-job", + } + ], + } + + workflow = dataproc.instantiate_inline_workflow_template(parent, + workflow_data) + + workflow.add_done_callback(callback) + global waiting_callback + waiting_callback = True + + +def callback(operation_future): + # Reset global when callback returns. + global waiting_callback + waiting_callback = False + + +def wait_for_workflow_end(): + """Wait for cluster creation.""" + print("Waiting for workflow completion ...") + print("Workflow and job progress, and job driver output available from: " + "https://console.cloud.google.com/dataproc/workflows/") + + while True: + if not waiting_callback: + print("Workflow completed.") + break + + +def main( + project_id, + zone, + cluster_name, + bucket_name, + pyspark_file=None, + create_new_cluster=True, + global_region=True, +): + + # [START dataproc_get_workflow_template_client] + if global_region: + region = "global" + # Use the default gRPC global endpoints. + dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient() + else: + region = get_region_from_zone(zone) + # Use a regional gRPC endpoint. See: + # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints + client_transport = (workflow_template_service_grpc_transport + .WorkflowTemplateServiceGrpcTransport( + address="{}-dataproc.googleapis.com:443" + .format(region))) + dataproc_workflow_client = dataproc_v1.WorkflowTemplateServiceClient( + client_transport + ) + # [END dataproc_get_workflow_template_client] + + try: + spark_file, spark_filename = get_pyspark_file(pyspark_file) + upload_pyspark_file(project_id, bucket_name, spark_filename, + spark_file) + + run_workflow( + dataproc_workflow_client, + project_id, + region, + zone, + bucket_name, + spark_filename, + cluster_name + ) + wait_for_workflow_end() + + finally: + spark_file.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=(argparse + .RawDescriptionHelpFormatter)) + parser.add_argument( + "--project_id", help="Project ID you want to access.", required=True + ) + parser.add_argument( + "--zone", help="Zone to create clusters in/connect to", required=True + ) + parser.add_argument( + "--cluster_name", help="Name of the cluster to create/connect to", + required=True + ) + parser.add_argument( + "--gcs_bucket", help="Bucket to upload Pyspark file to", required=True + ) + parser.add_argument( + "--pyspark_file", help="Pyspark filename. Defaults to pyspark_sort.py" + ) + parser.add_argument("--global_region", + action="store_true", + help="If cluster is in the global region") + + args = parser.parse_args() + main( + args.project_id, + args.zone, + args.cluster_name, + args.gcs_bucket, + args.pyspark_file, + ) diff --git a/dataproc/snippets/submit_job_to_cluster.py b/dataproc/snippets/submit_job_to_cluster.py new file mode 100644 index 000000000000..389cbec87aa3 --- /dev/null +++ b/dataproc/snippets/submit_job_to_cluster.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +r"""Sample command-line program to run a pyspark job on a new or existing +cluster. + +Global region clusters are supported with --global_region flag. + +Example Usage to run the pyspark job on a new cluster: +python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ + --create_new_cluster --cluster_name=$CLUSTER --zone=$ZONE + +Example Usage to run the pyspark job on an existing global region cluster: +python submit_job_to_cluster.py --project_id=$PROJECT --gcs_bucket=$BUCKET \ + --global_region --cluster_name=$CLUSTER --zone=$ZONE + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os + +from google.cloud import dataproc_v1 +from google.cloud import storage +from google.cloud.dataproc_v1.gapic.transports import ( + cluster_controller_grpc_transport) +from google.cloud.dataproc_v1.gapic.transports import ( + job_controller_grpc_transport) + + +DEFAULT_FILENAME = 'pyspark_sort.py' +waiting_callback = False + + +def get_pyspark_file(pyspark_file=None): + if pyspark_file: + f = open(pyspark_file, "rb") + return f, os.path.basename(pyspark_file) + else: + """Gets the PySpark file from current directory.""" + current_dir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(current_dir, DEFAULT_FILENAME), "rb") + return f, DEFAULT_FILENAME + + +def get_region_from_zone(zone): + try: + region_as_list = zone.split('-')[:-1] + return '-'.join(region_as_list) + except (AttributeError, IndexError, ValueError): + raise ValueError('Invalid zone provided, please check your input.') + + +def upload_pyspark_file(project, bucket_name, filename, spark_file): + """Uploads the PySpark file in this directory to the configured input + bucket.""" + print('Uploading pyspark file to Cloud Storage.') + client = storage.Client(project=project) + bucket = client.get_bucket(bucket_name) + blob = bucket.blob(filename) + blob.upload_from_file(spark_file) + + +def download_output(project, cluster_id, output_bucket, job_id): + """Downloads the output file from Cloud Storage and returns it as a + string.""" + print('Downloading output file.') + client = storage.Client(project=project) + bucket = client.get_bucket(output_bucket) + output_blob = ( + ('google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'. + format(cluster_id, job_id))) + return bucket.blob(output_blob).download_as_string() + + +# [START dataproc_create_cluster] +def create_cluster(dataproc, project, zone, region, cluster_name): + """Create the cluster.""" + print('Creating cluster...') + zone_uri = \ + 'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'.format( + project, zone) + cluster_data = { + 'project_id': project, + 'cluster_name': cluster_name, + 'config': { + 'gce_cluster_config': { + 'zone_uri': zone_uri + }, + 'master_config': { + 'num_instances': 1, + 'machine_type_uri': 'n1-standard-1' + }, + 'worker_config': { + 'num_instances': 2, + 'machine_type_uri': 'n1-standard-1' + } + } + } + + cluster = dataproc.create_cluster(project, region, cluster_data) + cluster.add_done_callback(callback) + global waiting_callback + waiting_callback = True +# [END dataproc_create_cluster] + + +def callback(operation_future): + # Reset global when callback returns. + global waiting_callback + waiting_callback = False + + +def wait_for_cluster_creation(): + """Wait for cluster creation.""" + print('Waiting for cluster creation...') + + while True: + if not waiting_callback: + print("Cluster created.") + break + + +# [START dataproc_list_clusters_with_detail] +def list_clusters_with_details(dataproc, project, region): + """List the details of clusters in the region.""" + for cluster in dataproc.list_clusters(project, region): + print(('{} - {}'.format(cluster.cluster_name, + cluster.status.State.Name( + cluster.status.state)))) +# [END dataproc_list_clusters_with_detail] + + +def get_cluster_id_by_name(dataproc, project_id, region, cluster_name): + """Helper function to retrieve the ID and output bucket of a cluster by + name.""" + for cluster in dataproc.list_clusters(project_id, region): + if cluster.cluster_name == cluster_name: + return cluster.cluster_uuid, cluster.config.config_bucket + + +# [START dataproc_submit_pyspark_job] +def submit_pyspark_job(dataproc, project, region, cluster_name, bucket_name, + filename): + """Submit the Pyspark job to the cluster (assumes `filename` was uploaded + to `bucket_name.""" + job_details = { + 'placement': { + 'cluster_name': cluster_name + }, + 'pyspark_job': { + 'main_python_file_uri': 'gs://{}/{}'.format(bucket_name, filename) + } + } + + result = dataproc.submit_job( + project_id=project, region=region, job=job_details) + job_id = result.reference.job_id + print('Submitted job ID {}.'.format(job_id)) + return job_id +# [END dataproc_submit_pyspark_job] + + +# [START dataproc_delete] +def delete_cluster(dataproc, project, region, cluster): + """Delete the cluster.""" + print('Tearing down cluster.') + result = dataproc.delete_cluster( + project_id=project, region=region, cluster_name=cluster) + return result +# [END dataproc_delete] + + +# [START dataproc_wait] +def wait_for_job(dataproc, project, region, job_id): + """Wait for job to complete or error out.""" + print('Waiting for job to finish...') + while True: + job = dataproc.get_job(project, region, job_id) + # Handle exceptions + if job.status.State.Name(job.status.state) == 'ERROR': + raise Exception(job.status.details) + elif job.status.State.Name(job.status.state) == 'DONE': + print('Job finished.') + return job +# [END dataproc_wait] + + +def main(project_id, + zone, + cluster_name, + bucket_name, + pyspark_file=None, + create_new_cluster=True, + global_region=True): + + # [START dataproc_get_client] + if global_region: + region = 'global' + # Use the default gRPC global endpoints. + dataproc_cluster_client = dataproc_v1.ClusterControllerClient() + dataproc_job_client = dataproc_v1.JobControllerClient() + else: + region = get_region_from_zone(zone) + # Use a regional gRPC endpoint. See: + # https://cloud.google.com/dataproc/docs/concepts/regional-endpoints + client_transport = ( + cluster_controller_grpc_transport.ClusterControllerGrpcTransport( + address='{}-dataproc.googleapis.com:443'.format(region))) + job_transport = ( + job_controller_grpc_transport.JobControllerGrpcTransport( + address='{}-dataproc.googleapis.com:443'.format(region))) + dataproc_cluster_client = dataproc_v1.ClusterControllerClient( + client_transport) + dataproc_job_client = dataproc_v1.JobControllerClient(job_transport) + # [END dataproc_get_client] + + try: + spark_file, spark_filename = get_pyspark_file(pyspark_file) + if create_new_cluster: + create_cluster(dataproc_cluster_client, project_id, zone, region, + cluster_name) + wait_for_cluster_creation() + upload_pyspark_file(project_id, bucket_name, spark_filename, + spark_file) + + list_clusters_with_details(dataproc_cluster_client, project_id, + region) + + (cluster_id, output_bucket) = ( + get_cluster_id_by_name(dataproc_cluster_client, project_id, + region, cluster_name)) + + # [START dataproc_call_submit_pyspark_job] + job_id = submit_pyspark_job(dataproc_job_client, project_id, region, + cluster_name, bucket_name, spark_filename) + # [END dataproc_call_submit_pyspark_job] + + wait_for_job(dataproc_job_client, project_id, region, job_id) + output = download_output(project_id, cluster_id, output_bucket, job_id) + print('Received job output {}'.format(output)) + return output + finally: + if create_new_cluster: + delete_cluster(dataproc_cluster_client, project_id, region, + cluster_name) + spark_file.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse. + RawDescriptionHelpFormatter) + parser.add_argument( + '--project_id', help='Project ID you want to access.', required=True) + parser.add_argument('--zone', + help='Zone to create clusters in/connect to', + required=True) + parser.add_argument('--cluster_name', + help='Name of the cluster to create/connect to', + required=True) + parser.add_argument('--gcs_bucket', + help='Bucket to upload Pyspark file to', + required=True) + parser.add_argument('--pyspark_file', + help='Pyspark filename. Defaults to pyspark_sort.py') + parser.add_argument('--create_new_cluster', + action='store_true', + help='States if the cluster should be created') + parser.add_argument('--global_region', + action='store_true', + help='If cluster is in the global region') + + args = parser.parse_args() + main(args.project_id, args.zone, args.cluster_name, args.gcs_bucket, + args.pyspark_file, args.create_new_cluster, args.global_region)