diff --git a/.gitbook.yaml b/.gitbook.yaml index 44f5ace4104..6cdb7e8ac5d 100644 --- a/.gitbook.yaml +++ b/.gitbook.yaml @@ -6,14 +6,14 @@ structure: redirects: how-to/customize-docker-builds/use-code-repositories-to-speed-up-docker-build-times: how-to/customize-docker-builds/how-to-reuse-builds.md - reference/migration-guide/README.md: how-to/manage-the-zenml-server/migration-guide/migration-guide.md - reference/migration-guide/migration-zero-twenty.md: how-to/manage-the-zenml-server/migration-guide/migration-zero-twenty.md - reference/migration-guide/migration-zero-thirty.md: how-to/manage-the-zenml-server/migration-guide/migration-zero-thirty.md - reference/migration-guide/migration-zero-forty.md: how-to/manage-the-zenml-server/migration-guide/migration-zero-forty.md - reference/migration-guide/migration-zero-sixty.md: how-to/manage-the-zenml-server/migration-guide/migration-zero-sixty.md + reference/migration-guide: how-to/manage-the-zenml-server/migration-guide/migration-guide.md + reference/migration-guide/migration-zero-twenty: how-to/manage-the-zenml-server/migration-guide/migration-zero-twenty.md + reference/migration-guide/migration-zero-thirty: how-to/manage-the-zenml-server/migration-guide/migration-zero-thirty.md + reference/migration-guide/migration-zero-forty: how-to/manage-the-zenml-server/migration-guide/migration-zero-forty.md + reference/migration-guide/migration-zero-sixty: how-to/manage-the-zenml-server/migration-guide/migration-zero-sixty.md - getting-started/deploying-zenml/manage-the-deployed-services/upgrade-the-version-of-the-zenml-server.md: how-to/manage-the-zenml-server/upgrade-zenml-server.md - getting-started/deploying-zenml/manage-the-deployed-services/troubleshoot-your-deployed-server.md: how-to/manage-the-zenml-server/troubleshoot-your-deployed-server.md - how-to/stack-deployment/implement-a-custom-integration.md: how-to/contribute-to-zenml/implement-a-custom-integration.md - - getting-started/zenml-pro/system-architectures: getting-started/system-architectures.md \ No newline at end of file + getting-started/deploying-zenml/manage-the-deployed-services/upgrade-the-version-of-the-zenml-server: how-to/manage-the-zenml-server/upgrade-zenml-server.md + getting-started/deploying-zenml/manage-the-deployed-services/troubleshoot-your-deployed-server: how-to/manage-the-zenml-server/troubleshoot-your-deployed-server.md + how-to/stack-deployment/implement-a-custom-integration: how-to/contribute-to-zenml/implement-a-custom-integration.md + how-to/setting-up-a-project-repository/best-practices: how-to/setting-up-a-project-repository/set-up-repository.md + getting-started/zenml-pro/system-architectures: getting-started/system-architectures.md diff --git a/.github/workflows/ci-fast.yml b/.github/workflows/ci-fast.yml index b2f0b8626b9..40eae8107ce 100644 --- a/.github/workflows/ci-fast.yml +++ b/.github/workflows/ci-fast.yml @@ -86,6 +86,10 @@ jobs: if: github.event.pull_request.draft == false strategy: matrix: + # IMPORTANT: Since we are using the combination of `arc-runner-set` + # and `3.10` in our `ci-fast` workflow, this combination has been + # excluded from the `ci-slow` workflow. If you change the configuration + # here, please adjust the configuration of `ci-slow` accordingly. 
os: [arc-runner-set] python-version: ['3.10'] fail-fast: false @@ -98,6 +102,11 @@ jobs: if: github.event.pull_request.draft == false strategy: matrix: + # IMPORTANT: Since we are using the combinations of `arc-runner-set`, + # `3.10`, and two different test environments in our `ci-fast` workflow, + # these combinations have been excluded from the `ci-slow` workflow. + # If you change the configuration here, please adjust the configuration + # of `ci-slow` accordingly. os: [arc-runner-set] python-version: ['3.10'] test_environment: [default, docker-server-docker-orchestrator-mysql] diff --git a/.github/workflows/ci-slow.yml b/.github/workflows/ci-slow.yml index 8053c2f0abd..7bca266e494 100644 --- a/.github/workflows/ci-slow.yml +++ b/.github/workflows/ci-slow.yml @@ -39,23 +39,6 @@ jobs: run: | echo "Please add the 'run-slow-ci' label to this PR before merging." exit 1 - docstring-check: - if: github.event.pull_request.draft == false - needs: run-slow-ci-label-is-set - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4.1.1 - - name: Set up Python - uses: actions/setup-python@v5.0.0 - with: - python-version: '3.10' - - name: Install current package as editable - run: | - curl -LsSf https://astral.sh/uv/install.sh | sh - source $HOME/.cargo/env - uv pip install --system darglint - - name: Check docstrings - run: bash scripts/docstring.sh mysql-db-migration-testing-full: if: github.event.pull_request.draft == false needs: run-slow-ci-label-is-set @@ -195,6 +178,12 @@ jobs: matrix: os: [arc-runner-set] python-version: ['3.9', '3.10', '3.11', '3.12'] + # IMPORTANT: Since we are using the following combination + # in our `ci-fast` workflow, this combination has been + # excluded from the `ci-slow` workflow. + exclude: + - os: arc-runner-set + python-version: '3.10' fail-fast: false uses: ./.github/workflows/unit-test.yml with: @@ -294,6 +283,15 @@ jobs: python-version: '3.11' - test_environment: docker-server-docker-orchestrator-mariadb python-version: '3.12' + # IMPORTANT: Since we are using the following combinations + # in our `ci-fast` workflow, these combinations have been + # excluded from the `ci-slow` workflow.
+ - os: arc-runner-set + test_environment: default + python-version: '3.10' + - os: arc-runner-set + test_environment: docker-server-docker-orchestrator-mysql + python-version: '3.10' fail-fast: false uses: ./.github/workflows/integration-test-slow.yml with: diff --git a/.github/workflows/release_finalize.yml b/.github/workflows/release_finalize.yml index 4bec32f323f..9f486782a7a 100644 --- a/.github/workflows/release_finalize.yml +++ b/.github/workflows/release_finalize.yml @@ -2,48 +2,20 @@ name: release-finalize on: workflow_dispatch: - pull_request: - types: [closed] - branches: ["misc/prepare-release-*"] + inputs: + latest_version: + description: "The latest version of ZenML" + required: true + type: string + new_version: + description: "The new version of ZenML" + required: true + type: string env: ZENML_ANALYTICS_OPT_IN: false GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} jobs: - fetch-versions: - if: github.repository == 'zenml-io/zenml' && github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, 'misc/prepare-release-') - runs-on: ubuntu-latest - outputs: - old_version: ${{ steps.old-version.outputs.old_version }} - new_version: ${{ steps.new-version.outputs.new_version }} - steps: - # Extract the version - - name: Extract version from branch name - id: new-version - run: | - BRANCH_NAME=${GITHUB_REF#refs/heads/} - NEW_VERSION=${BRANCH_NAME#misc/prepare-release-} - echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT - # Checkout main as develop is already changed - - name: Checkout code - id: checkout-code - uses: actions/checkout@v4.1.1 - with: - ref: main - # Configure Git - - name: Configure git - shell: bash - run: | - git config --global user.email "info@zenml.io" - git config --global user.name "ZenML GmbH" - # Extract the old version - - name: Fetch the old version - id: old-version - run: | - LATEST_RELEASE=$(gh release view --json tagName,publishedAt -q '{tag: .tagName, date: .publishedAt}') - OLD_VERSION=$(echo "$LATEST_RELEASE" | jq -r .tag) - echo "old_version=$OLD_VERSION" >> $GITHUB_OUTPUT create-release-branch: - needs: fetch-versions runs-on: ubuntu-latest steps: # Configure Git @@ -60,10 +32,10 @@ jobs: # Create the release branch - name: Release branch run: | - git checkout -b release/${{ needs.fetch-versions.outputs.new_version }} - git push --set-upstream origin release/${{ needs.fetch-versions.outputs.new_version }} + git pull + git checkout -b release/${{ github.event.inputs.new_version }} + git push --set-upstream origin release/${{ github.event.inputs.new_version }} add-docs-warning-header: - needs: fetch-versions runs-on: ubuntu-latest steps: # Configure Git @@ -76,14 +48,14 @@ jobs: - name: Checkout code uses: actions/checkout@v4.1.1 with: - ref: release/${{ needs.fetch-versions.outputs.old_version }} + ref: release/${{ github.event.inputs.latest_version }} # Create the docs update PR - name: Create docs update PR shell: bash run: | - bash scripts/add-docs-warning.sh ${{ needs.fetch-versions.outputs.old_version }} + git pull + bash scripts/add-docs-warning.sh ${{ github.event.inputs.latest_version }} add-new-version-to-migration-tests: - needs: fetch-versions runs-on: ubuntu-latest steps: # Configure Git @@ -101,9 +73,9 @@ jobs: - name: Create docs update PR shell: bash run: |- - bash scripts/add-migration-test-version.sh ${{ needs.fetch-versions.outputs.old_version }} ${{ needs.fetch-versions.outputs.new_version }} + git pull + bash scripts/add-migration-test-version.sh ${{ github.event.inputs.latest_version }} ${{ 
github.event.inputs.new_version }} order-gitbook-release-spaces: - needs: fetch-versions runs-on: ubuntu-latest steps: # Check out develop @@ -124,15 +96,15 @@ jobs: # Adjust the docs - name: Adjust gitbook docs env: - ZENML_NEW_VERSION: ${{ needs.fetch-versions.outputs.new_version } - ZENML_OLD_VERSION: ${{ needs.fetch-versions.outputs.old_version } + ZENML_NEW_VERSION: ${{ github.event.inputs.new_version }} + ZENML_OLD_VERSION: ${{ github.event.inputs.latest_version }} GITBOOK_API_KEY: ${{secrets.GITBOOK_API_KEY}} GITBOOK_ORGANIZATION: ${{secrets.GITBOOK_ORGANIZATION}} GITBOOK_DOCS_COLLECTION: ${{secrets.GITBOOK_DOCS_COLLECTION}} GITBOOK_LEGACY_COLLECTION: ${{secrets.GITBOOK_LEGACY_COLLECTION}} run: python scripts/sync-gitbook-release-spaces.py deprecate-docs-gitbook-legacy: - needs: [fetch-versions, order-gitbook-release-spaces] + needs: order-gitbook-release-spaces runs-on: ubuntu-latest steps: # Configure Git @@ -150,4 +122,4 @@ jobs: - name: Update legacy docs file shell: bash run: |- - bash scripts/deprecate-previous-docs-to-legacy.sh ${{ needs.fetch-versions.outputs.old_version }} + bash scripts/deprecate-previous-docs-to-legacy.sh ${{ github.event.inputs.latest_version }} diff --git a/.github/workflows/release_prepare.yml b/.github/workflows/release_prepare.yml index 43555782119..1d60f6c3b0c 100644 --- a/.github/workflows/release_prepare.yml +++ b/.github/workflows/release_prepare.yml @@ -1,16 +1,15 @@ --- name: release-prepare on: - workflow_dispatch: - create: - branches: ["misc/prepare-release-*"] - types: [branch] + push: + branches: + - "misc/prepare-release-*" env: ZENML_ANALYTICS_OPT_IN: false GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} jobs: fetch-versions: - if: github.repository == 'zenml-io/zenml' + if: github.event.created && github.repository == 'zenml-io/zenml' runs-on: ubuntu-latest outputs: old_version: ${{ steps.old-version.outputs.old_version }} @@ -241,4 +240,4 @@ jobs: sed -i 's|zenml\[server\]==[^[:space:]]*|git+https://github.com/zenml-io/zenml.git@${{ github.ref }}#egg=zenml[server]|g' requirements_${{ matrix.cloud }}.txt pip install -r requirements_${{ matrix.cloud }}.txt zenml integration install ${{ matrix.cloud }} -y - python run.py --model_type=t5-small + python run.py --model_type=t5-small --no-cache diff --git a/README.md b/README.md index 373e86b4577..5f2a19d31ec 100644 --- a/README.md +++ b/README.md @@ -327,7 +327,7 @@ the Apache License Version 2.0. Projects Showcase

- 🎉 Version 0.67.0 is out. Check out the release notes + 🎉 Version 0.68.0 is out. Check out the release notes here.
🖥️ Download our VS Code Extension here. diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 1131caeb096..40d6fdb7b66 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,134 @@ +# 0.68.0 + +## Highlights + +- **Stack Components on the Dashboard:** We're bringing back stack components. +With this release you will get access to the list of your stack components on +the ZenML dashboard. More functionality is going to follow in the next releases. +- **Client-Side Caching:** Implemented client-side computation for cached steps, +significantly reducing time and costs associated with remote orchestrator +spin-up. +- **Streamlined Onboarding Process:** Unified the starter and production setup +into a single sequential flow, providing a more intuitive user experience. +- **BentoML Integration:** Updated to version 1.3.5 with enhanced +containerization support. +- **Artifact Management:** Introduced `register_artifact` function enabling +direct linking of existing data in the artifact store, particularly useful +for tools like PyTorch-Lightning that manage their own checkpoints. +- **Enhanced Error Handling:** Added Error Boundary to visualization components +for improved reliability and user experience. + +## Additional Features and Improvements + +- Added multiple access points for deleting pipeline runs +- Improved pipeline detail view functionality +- Improved service account handling for Kaniko image builder + +## Breaking Changes and Deprecations + +- Discontinued Python 3.8 support +- Removed legacy pipeline and step interface +- Removed legacy post execution workflow +- Removed legacy dashboard option +- Removed `zenml stack up/down` CLI commands +- Removed `zenml deploy` and `zenml <stack-component> deploy` +- Removed `StepEnvironment` class +- Removed `ArtifactConfig` class for model version specification +- Removed `ExternalArtifact` class +- Deprecated `Client.list_runs` in favor of `Client.list_pipeline_runs` +- Deprecated `ArtifactVersionResponse.read` in favor of `ArtifactVersionResponse.load` + +## Documentation Updates + +Added new guides for the following topics: + +- Kubernetes per-pod configuration +- Factory generation of artifact names +- Common stacks best practices +- Azure 1-click dashboard deployment +- ZenML server upgrade best practices +- Custom Dataset classes and Materializers +- Comprehensive ZenML Pro documentation +- Image building optimization during pipeline runs +- Enhanced BentoML integration documentation + +## What's Changed +* Release 0.67.0 migration testing by @bcdurak in https://github.com/zenml-io/zenml/pull/3050 +* Prevent too large requests by @avishniakov in https://github.com/zenml-io/zenml/pull/3048 +* Fix Neptune linting after 1.12.0 release by @avishniakov in https://github.com/zenml-io/zenml/pull/3055 +* Fix Lightning Orchestrator (remove -y from pip install) by @wjayesh in https://github.com/zenml-io/zenml/pull/3058 +* Fix artifact pruning endpoint path by @schustmi in https://github.com/zenml-io/zenml/pull/3052 +* Update python versioning in docs by @avishniakov in https://github.com/zenml-io/zenml/pull/3059 +* Fix infinite loop while fetching artifact store in logs storage class by @avishniakov in https://github.com/zenml-io/zenml/pull/3061 +* Make sync a setting for sagemaker/azureml orchestrator by @schustmi in https://github.com/zenml-io/zenml/pull/3062 +* Remove some deprecated features by @schustmi in https://github.com/zenml-io/zenml/pull/2926 +* Fix MySQL warning when filtering
pipelines by latest run by @schustmi in https://github.com/zenml-io/zenml/pull/3051 +* Remove more deprecated stuff by @schustmi in https://github.com/zenml-io/zenml/pull/3063 +* Remove log versions from versioned buckets in S3 by @avishniakov in https://github.com/zenml-io/zenml/pull/3060 +* add docs on k8s per pod settings by @wjayesh in https://github.com/zenml-io/zenml/pull/3066 +* Remove Python 3.8 support by @strickvl in https://github.com/zenml-io/zenml/pull/3034 +* `register_artifact` function by @avishniakov in https://github.com/zenml-io/zenml/pull/3053 +* Fix bad link in docs by @avishniakov in https://github.com/zenml-io/zenml/pull/3069 +* Fix model linkage for the lazy loading scenarios by @avishniakov in https://github.com/zenml-io/zenml/pull/3054 +* Updating template versions after the Python 3.8 changes by @bcdurak in https://github.com/zenml-io/zenml/pull/3070 +* Add UUID materializer by @htahir1 in https://github.com/zenml-io/zenml/pull/3073 +* Fix pipeline and model URLs for ZenML Pro on-prem deployments by @stefannica in https://github.com/zenml-io/zenml/pull/3083 +* Update bentoml integration to 1.3.5 and add containerization by @wjayesh in https://github.com/zenml-io/zenml/pull/3045 +* Fix mlflow linting by @schustmi in https://github.com/zenml-io/zenml/pull/3085 +* Add docs for factory generation of artifact names by @strickvl in https://github.com/zenml-io/zenml/pull/3084 +* Remove unnecessary metadata fields in UUID materializer test by @htahir1 in https://github.com/zenml-io/zenml/pull/3088 +* Client-side computation of cached steps by @schustmi in https://github.com/zenml-io/zenml/pull/3068 +* Fix Kaniko image builder service account passing by @schustmi in https://github.com/zenml-io/zenml/pull/3081 +* Bugfix in GitLab Code Repository integration by @4gt-104 in https://github.com/zenml-io/zenml/pull/3076 +* Add docs on common stacks best practices by @strickvl in https://github.com/zenml-io/zenml/pull/3092 +* [docs] Update stacks page and add azure 1-click from dashboard docs by @wjayesh in https://github.com/zenml-io/zenml/pull/3082 +* Local development how-to section by @strickvl in https://github.com/zenml-io/zenml/pull/3090 +* [docs] best practices for upgrading zenml server by @wjayesh in https://github.com/zenml-io/zenml/pull/3087 +* Fix S3 ArtifactStore auth issue by @avishniakov in https://github.com/zenml-io/zenml/pull/3086 +* Reduce migration testing runtime by @avishniakov in https://github.com/zenml-io/zenml/pull/3078 +* [docs] Dedicated docs on how to skip building an image on pipeline run by @wjayesh in https://github.com/zenml-io/zenml/pull/3079 +* Fix filtering by tag for pipeline runs by @schustmi in https://github.com/zenml-io/zenml/pull/3097 +* Remove deprecated features: `zenml deploy` and `zenml <stack-component> deploy` by @stefannica in https://github.com/zenml-io/zenml/pull/3089 +* Do not tag model via `Model` class on creation by @avishniakov in https://github.com/zenml-io/zenml/pull/3098 +* Sagemaker add pipeline tags by @htahir1 in https://github.com/zenml-io/zenml/pull/3080 +* [docs] Add custom Dataset classes and Materializers in ZenML by @htahir1 in https://github.com/zenml-io/zenml/pull/3091 +* Delete Scarf related scripts and workflow files by @htahir1 in https://github.com/zenml-io/zenml/pull/3103 +* Add more detailed docs for ZenML
Pro by @wjayesh in https://github.com/zenml-io/zenml/pull/3065 +* Add missing code hash filter in client method by @schustmi in https://github.com/zenml-io/zenml/pull/3094 +* Remove lineage graph and legacy dashboard support by @schustmi in https://github.com/zenml-io/zenml/pull/3064 +* Add unittest to cover gitlab CR regex. by @4gt-104 in https://github.com/zenml-io/zenml/pull/3102 +* Automating the release process using Github workflows by @bcdurak in https://github.com/zenml-io/zenml/pull/3101 +* Bugfix for release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3107 +* Bugfix for new version in the release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3108 +* using the right parent image name by @bcdurak in https://github.com/zenml-io/zenml/pull/3109 +* Making the new release automation scripts executable by @bcdurak in https://github.com/zenml-io/zenml/pull/3110 +* Fixing the env variables for the release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3111 +* Adding the right Github configuration before using the `gh` CLI to fetch the version by @bcdurak in https://github.com/zenml-io/zenml/pull/3112 +* Fixing the outputs of the first step in the release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3113 +* Handling github auth and release notes for release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3114 +* Fixing the cloudbuild call for release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3116 +* Fixing the update tenant call in the script by @bcdurak in https://github.com/zenml-io/zenml/pull/3118 +* Release automation with the new redeploy logic by @bcdurak in https://github.com/zenml-io/zenml/pull/3120 +* Fixing the automation triggers for other branches by @bcdurak in https://github.com/zenml-io/zenml/pull/3125 +* Update link for `llm-complete-guide` repository.- Updated link to poi… by @htahir1 in https://github.com/zenml-io/zenml/pull/3128 +* Fixing the migration testing for the release branches by @bcdurak in https://github.com/zenml-io/zenml/pull/3127 +* Update pipeline deletion docs by @strickvl in https://github.com/zenml-io/zenml/pull/3123 +* Disabling the cache for the quickstart tests by @bcdurak in https://github.com/zenml-io/zenml/pull/3133 +* Update Argilla integration for v2.x SDK by @sdiazlor in https://github.com/zenml-io/zenml/pull/2915 +* Using pip instead of `gh` CLI in the migration tests by @bcdurak in https://github.com/zenml-io/zenml/pull/3136 +* Adapting tags to work with older versions of Sagemaker by @bcdurak in https://github.com/zenml-io/zenml/pull/3135 +* Manual trigger for the `release_finalize` workflow by @bcdurak in https://github.com/zenml-io/zenml/pull/3137 +* Fixing the prepare trigger for the release automation by @bcdurak in https://github.com/zenml-io/zenml/pull/3138 + +## New Contributors +* @4gt-104 made their first contribution in https://github.com/zenml-io/zenml/pull/3076 +* @sdiazlor made their first contribution in https://github.com/zenml-io/zenml/pull/2915 + +**Full Changelog**: https://github.com/zenml-io/zenml/compare/0.67.0...0.68.0 + + # 0.67.0 ## Highlights diff --git a/docs/book/.gitbook/assets/argilla_annotator.png b/docs/book/.gitbook/assets/argilla_annotator.png index 4cde7bac206..8b01f3744fd 100644 Binary files
a/docs/book/.gitbook/assets/argilla_annotator.png and b/docs/book/.gitbook/assets/argilla_annotator.png differ diff --git a/docs/book/.gitbook/assets/data_scientist_connector_role.png b/docs/book/.gitbook/assets/data_scientist_connector_role.png new file mode 100644 index 00000000000..ac19b31fa3d Binary files /dev/null and b/docs/book/.gitbook/assets/data_scientist_connector_role.png differ diff --git a/docs/book/.gitbook/assets/model_pipeline_artifact.png b/docs/book/.gitbook/assets/model_pipeline_artifact.png new file mode 100644 index 00000000000..38296835027 Binary files /dev/null and b/docs/book/.gitbook/assets/model_pipeline_artifact.png differ diff --git a/docs/book/.gitbook/assets/platform_engineer_connector_role.png b/docs/book/.gitbook/assets/platform_engineer_connector_role.png new file mode 100644 index 00000000000..d403c22f52e Binary files /dev/null and b/docs/book/.gitbook/assets/platform_engineer_connector_role.png differ diff --git a/docs/book/component-guide/annotators/annotators.md b/docs/book/component-guide/annotators/annotators.md index f0592a1eb3f..eed8c494e28 100644 --- a/docs/book/component-guide/annotators/annotators.md +++ b/docs/book/component-guide/annotators/annotators.md @@ -55,7 +55,7 @@ The core parts of the annotation workflow include: ### List of available annotators For production use cases, some more flavors can be found in specific `integrations` modules. In terms of annotators, -ZenML features integrations with `label_studio` and `pigeon`. +ZenML features integrations with the following tools. | Annotator | Flavor | Integration | Notes | |-----------------------------------------|----------------|----------------|----------------------------------------------------------------------| diff --git a/docs/book/component-guide/annotators/argilla.md b/docs/book/component-guide/annotators/argilla.md index b0ed6f92a53..b136e0a4cd9 100644 --- a/docs/book/component-guide/annotators/argilla.md +++ b/docs/book/component-guide/annotators/argilla.md @@ -4,12 +4,7 @@ description: Annotating data using Argilla. # Argilla -[Argilla](https://github.com/argilla-io/argilla) is an open-source data curation -platform designed to enhance the development of both small and large language -models (LLMs) and NLP tasks in general. It enables users to build robust -language models through faster data curation using both human and machine -feedback, providing support for each step in the MLOps cycle, from data labeling -to model monitoring. +[Argilla](https://github.com/argilla-io/argilla) is a collaboration tool for AI engineers and domain experts who need to build high-quality datasets for their projects. It enables users to build robust language models through faster data curation using both human and machine feedback, providing support for each step in the MLOps cycle, from data labeling to model monitoring. ![Argilla Annotator](../../.gitbook/assets/argilla_annotator.png) @@ -31,7 +26,7 @@ of Argilla as well as a deployed instance of Argilla. There is an easy way to deploy Argilla as a [Hugging Face Space](https://huggingface.co/docs/hub/spaces-sdks-docker-argilla), for instance, which is documented in the [Argilla -documentation](https://docs.argilla.io/en/latest/getting_started/installation/deployments/huggingface-spaces.html). +documentation](https://docs.argilla.io/latest/getting_started/quickstart/). ### How to deploy it? 
@@ -59,16 +54,16 @@ zenml secret create argilla_secrets --api_key="" Then register your annotator with ZenML: ```shell -zenml annotator register argilla --flavor argilla --authentication_secret=argilla_secrets +zenml annotator register argilla --flavor argilla --authentication_secret=argilla_secrets --port=6900 ``` When using a deployed instance of Argilla, the instance URL must be specified without any trailing `/` at the end. If you are using a Hugging Face Spaces instance and its visibility is set to private, you must also set the -`extra_headers` parameter which would include a Hugging Face token. For example: +`headers` parameter which would include a Hugging Face token. For example: ```shell -zenml annotator register argilla --flavor argilla --authentication_secret=argilla_secrets --instance_url="https://[your-owner-name]-[your_space_name].hf.space" --extra_headers="{"Authorization": f"Bearer {}"}" +zenml annotator register argilla --flavor argilla --authentication_secret=argilla_secrets --instance_url="https://[your-owner-name]-[your_space_name].hf.space" --headers='{"Authorization": "Bearer {[your_hugging_face_token]}"}' ``` Finally, add all these components to a stack and set it as your active stack. @@ -95,9 +90,8 @@ functionality via the ZenML SDK. You can access information about the datasets you're using with the `zenml annotator dataset list`. To work on annotation for a particular dataset, you can -run `zenml annotator dataset annotate `. What follows is an -overview of some key components to the Argilla integration and how it can be -used. +run `zenml annotator dataset annotate `. This will open the Argilla +web interface for you to start annotating the dataset. #### Argilla Annotator Stack Component diff --git a/docs/book/how-to/build-pipelines/delete-a-pipeline.md b/docs/book/how-to/build-pipelines/delete-a-pipeline.md index dbd4e9cff8f..a92c7232615 100644 --- a/docs/book/how-to/build-pipelines/delete-a-pipeline.md +++ b/docs/book/how-to/build-pipelines/delete-a-pipeline.md @@ -26,6 +26,34 @@ Deleting a pipeline does not automatically delete any of its associated runs or artifacts. {% endhint %} +If you want to delete multiple pipelines at once, you might find the Python SDK +preferable. If you have pipelines with the same prefix, you will need to pass in +the `id` of each pipeline separately so ZenML is able to identify them. In this +case, you could use a script like the following: + +```python +from zenml.client import Client + +client = Client() + +# Get the list of pipelines that start with "test_pipeline" +# use a large size to ensure we get all of them +pipelines_list = client.list_pipelines(name="startswith:test_pipeline", size=100) + +target_pipeline_ids = [p.id for p in pipelines_list.items] + +print(f"Found {len(target_pipeline_ids)} pipelines to delete") + +confirmation = input("Do you really want to delete these pipelines? 
(y/n): ").lower() + +if confirmation == 'y': + print(f"Deleting {len(target_pipeline_ids)} pipelines") + for pid in target_pipeline_ids: + client.delete_pipeline(pid) + print("Deletion complete") +else: + print("Deletion cancelled") +``` ## Delete a pipeline run diff --git a/docs/book/how-to/customize-docker-builds/how-to-reuse-builds.md b/docs/book/how-to/customize-docker-builds/how-to-reuse-builds.md index bffce76a90e..9f278b28848 100644 --- a/docs/book/how-to/customize-docker-builds/how-to-reuse-builds.md +++ b/docs/book/how-to/customize-docker-builds/how-to-reuse-builds.md @@ -33,7 +33,7 @@ While reusing Docker builds is useful, it can be limited. This is because specif ## Use the artifact store to upload your code -You can also let ZenML use the artifact store to upload your code. This is the default behaviour if no code repository is detected and the `allow_download_from_artifact_store` flag is not set to `False` in your `DockerSettings`. +You can also let ZenML use the artifact store to upload your code. This is the default behavior if no code repository is detected and the `allow_download_from_artifact_store` flag is not set to `False` in your `DockerSettings`. ## Use code repositories to speed up Docker build times diff --git a/docs/book/how-to/customize-docker-builds/how-to-use-a-private-pypi-repository.md b/docs/book/how-to/customize-docker-builds/how-to-use-a-private-pypi-repository.md new file mode 100644 index 00000000000..344a5b5e891 --- /dev/null +++ b/docs/book/how-to/customize-docker-builds/how-to-use-a-private-pypi-repository.md @@ -0,0 +1,44 @@ +--- +description: How to use a private PyPI repository. +--- + +# How to use a private PyPI repository + +For packages that require authentication, you may need to take additional steps: + +1. Use environment variables to store credentials securely. +2. Configure pip or poetry to use these credentials when installing packages. +3. Consider using custom Docker images that have the necessary authentication setup. + +Here's an example of how you might set up authentication using environment variables: + +```python +import os + +from my_simple_package import important_function +from zenml.config import DockerSettings +from zenml import step, pipeline + +docker_settings = DockerSettings( + requirements=["my-simple-package==0.1.0"], + environment={'PIP_EXTRA_INDEX_URL': f"https://{os.environ.get('PYPI_TOKEN', '')}@my-private-pypi-server.com/{os.environ.get('PYPI_USERNAME', '')}/"} +) + +@step +def my_step(): + return important_function() + +@pipeline(settings={"docker": docker_settings}) +def my_pipeline(): + my_step() + +if __name__ == "__main__": + my_pipeline() +``` + +Note: Be cautious with handling credentials. Always use secure methods to manage +and distribute authentication information within your team. + +
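As a usage sketch of the example above (reusing the same illustrative `PYPI_TOKEN` and `PYPI_USERNAME` variable names and the `my-private-pypi-server.com` placeholder), you may want to fail fast when the credentials are missing, since an empty value would otherwise bake a malformed index URL into the image:

```python
import os

from zenml.config import DockerSettings

# Illustrative variable names, matching the example above; adjust them to
# whatever your secrets tooling actually exports.
token = os.environ.get("PYPI_TOKEN")
username = os.environ.get("PYPI_USERNAME")
if not token or not username:
    # Fail fast: a missing token or username would only surface later as a
    # confusing pip error inside the Docker build.
    raise RuntimeError("Set PYPI_TOKEN and PYPI_USERNAME before running the pipeline.")

docker_settings = DockerSettings(
    requirements=["my-simple-package==0.1.0"],
    environment={
        "PIP_EXTRA_INDEX_URL": f"https://{token}@my-private-pypi-server.com/{username}/"
    },
)
```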
ZenML Scarf
+ + diff --git a/docs/book/how-to/manage-the-zenml-server/best-practices-upgrading-zenml.md b/docs/book/how-to/manage-the-zenml-server/best-practices-upgrading-zenml.md index dc6a397ab4f..c96ba3cf778 100644 --- a/docs/book/how-to/manage-the-zenml-server/best-practices-upgrading-zenml.md +++ b/docs/book/how-to/manage-the-zenml-server/best-practices-upgrading-zenml.md @@ -42,7 +42,7 @@ ZenML Pro comes with multi-tenancy which makes it easy for you to have multiple ## Upgrading your code -Sometimes, you might have to upgrade your code to work with a new version of ZenML. This is true especially when you are moving from a really old version to a new major version. The following tips might help, in addition to everything you've learnt in this document so far. +Sometimes, you might have to upgrade your code to work with a new version of ZenML. This is true especially when you are moving from a really old version to a new major version. The following tips might help, in addition to everything you've learned in this document so far. ### Testing and Compatibility diff --git a/docs/book/how-to/setting-up-a-project-repository/README.md b/docs/book/how-to/setting-up-a-project-repository/README.md index c0cb737edd7..eb203046c2a 100644 --- a/docs/book/how-to/setting-up-a-project-repository/README.md +++ b/docs/book/how-to/setting-up-a-project-repository/README.md @@ -1,13 +1,93 @@ --- -description: Setting your team up for success with a project repository. +description: Setting your team up for success with a well-architected ZenML project. --- -# 😸 Setting up a project repository +# 😸 Setting up a Well-Architected ZenML Project -ZenML code typically lives in a `git` repository. Setting this repository up correctly can make a huge impact on collaboration and -getting the maximum out of your ZenML deployment. This section walks users through some of the options available to create a project -repository with ZenML. +Welcome to the guide on setting up a well-architected ZenML project. This section will provide you with a comprehensive overview of best practices, strategies, and considerations for structuring your ZenML projects to ensure scalability, maintainability, and collaboration within your team. -

A visual representation of how the code repository fits into the general ZenML architecture.

+## The Importance of a Well-Architected Project + +A well-architected ZenML project is crucial for the success of your machine learning operations (MLOps). It provides a solid foundation for your team to develop, deploy, and maintain ML models efficiently. By following best practices and leveraging ZenML's features, you can create a robust and flexible MLOps pipeline that scales with your needs. + +## Key Components of a Well-Architected ZenML Project + +### Repository Structure + +A clean and organized repository structure is essential for any ZenML project. This includes: + +- Proper folder organization for pipelines, steps, and configurations +- Clear separation of concerns between different components +- Consistent naming conventions + +Learn more about setting up your repository in the [Set up repository guide](./set-up-repository.md). + +### Version Control and Collaboration + +Integrating your ZenML project with version control systems like Git is crucial for team collaboration and code management. This allows for: + +- Faster pipeline builds, as you can leverage the same image and [have ZenML download code from your repository](../../how-to/customize-docker-builds/how-to-reuse-builds.md#use-code-repositories-to-speed-up-docker-build-times) +- Easy tracking of changes +- Collaboration among team members + +Discover how to connect your Git repository in the [Set up a repository guide](./set-up-repository.md). + +### Stacks, Pipelines, Models, and Artifacts + +Understanding the relationship between stacks, pipelines, models, and artifacts is key to designing an efficient ZenML project: + +- Stacks: Define your infrastructure and tool configurations +- Models: Represent your machine learning models and their metadata +- Pipelines: Encapsulate your ML workflows +- Artifacts: Track your data and model outputs + +Learn about organizing these components in the [Organizing Stacks, Pipelines, Models, and Artifacts guide](./stacks-pipelines-models.md). + +### Access Management and Roles + +Proper access management ensures that team members have the right permissions and responsibilities: + +- Define roles such as data scientists, MLOps engineers, and infrastructure managers +- Set up [service connectors](../auth-management/README.md) and manage authorizations +- Establish processes for pipeline maintenance and server upgrades +- Leverage [Teams in ZenML Pro](../../getting-started/zenml-pro/teams.md) to assign roles and permissions to a group of users, to mimic your real-world team roles. + +Explore access management strategies in the [Access Management and Roles guide](./access-management.md). + +### Shared Components and Libraries + +Leverage shared components and libraries to promote code reuse and standardization across your team: + +- Custom flavors, steps, and materializers +- Shared private wheels for internal distribution +- Handling authentication for specific libraries + +Find out more about sharing code in the [Shared Libraries and Logic for Teams guide](./shared-components-for-teams.md). + +### Project Templates + +Utilize project templates to kickstart your ZenML projects and ensure consistency: + +- Use pre-made templates for common use cases +- Create custom templates tailored to your team's needs + +Learn about using and creating project templates in the [Project Templates guide](./using-project-templates.md). 
+ +### Migration and Maintenance + +As your project evolves, you may need to migrate existing codebases or upgrade your ZenML server: + +- Strategies for migrating legacy code to newer ZenML versions +- Best practices for upgrading ZenML servers + +Discover migration strategies and maintenance best practices in the [Migration and Maintenance guide](../../how-to/manage-the-zenml-server/best-practices-upgrading-zenml.md#upgrading-your-code). + +## Getting Started + +To begin building your well-architected ZenML project, start by exploring the guides in this section. Each guide provides in-depth information on specific aspects of project setup and management. + +Remember, a well-architected project is an ongoing process. Regularly review and refine your project structure, processes, and practices to ensure they continue to meet your team's evolving needs. + +By following these guidelines and leveraging ZenML's powerful features, you'll be well on your way to creating a robust, scalable, and collaborative MLOps environment.
ZenML Scarf
diff --git a/docs/book/how-to/setting-up-a-project-repository/access-management.md b/docs/book/how-to/setting-up-a-project-repository/access-management.md new file mode 100644 index 00000000000..8e12187a73c --- /dev/null +++ b/docs/book/how-to/setting-up-a-project-repository/access-management.md @@ -0,0 +1,93 @@ +--- +description: A guide on managing user roles and responsibilities in ZenML. +--- + +# Access Management and Roles in ZenML + +Effective access management is crucial for maintaining security and efficiency in your ZenML projects. This guide will help you understand the different roles within a ZenML server and how to manage access for your team members. + +## Typical Roles in an ML Project + +In an ML project, you will typically have the following roles: + +- Data Scientists: Primarily work on developing and running pipelines. +- MLOps Platform Engineers: Manage the infrastructure and stack components. +- Project Owners: Oversee the entire ZenML deployment and manage user access. + +The above is a rough outline of the roles you might have in your team. In your case, the names might be different or there might be more roles, but you can loosely map the responsibilities we discuss in this document to your own project. + +{% hint style="info" %} +You can create [Roles in ZenML Pro](../../getting-started/zenml-pro/roles.md) with a given set of permissions and assign them to either Users or Teams that represent your real-world team structure. Sign up for a free trial to try it yourself: https://cloud.zenml.io/ +{% endhint %} + +## Service Connectors: Gateways to External Services + +Service connectors are how different cloud services are integrated with ZenML. They are used to abstract away the credentials and other configurations needed to access these services. + +Ideally, only the MLOps Platform Engineers should have access to create and manage connectors. This is because they are closest to your infrastructure and can make informed decisions about what authentication mechanisms to use and more. + +Other team members can use connectors to create stack components that talk to the external services, but should not have to worry about setting them up and shouldn't have access to the credentials used to configure them. + +Let's look at an example of how this works in practice. +Imagine you have a `DataScientist` role in your ZenML server. This role should only be able to use the connectors to create stack components and run pipelines. They shouldn't have access to the credentials used to configure these connectors. Therefore, the permissions for this role could look like the following: + +![Data Scientist Permissions](../../.gitbook/assets/data_scientist_connector_role.png) + +Notice that the role doesn't grant the data scientist permissions to create, update, or delete connectors, or read their secret values. + +On the other hand, the `MLOpsPlatformEngineer` role has the permissions to create, update, and delete connectors, as well as read their secret values. The permissions for this role could look like the following: + +![MLOps Platform Engineer Permissions](../../.gitbook/assets/platform_engineer_connector_role.png) + +{% hint style="info" %} +Note that you can only use the RBAC features in ZenML Pro. Learn more about roles in ZenML Pro [here](../../getting-started/zenml-pro/roles.md). 
+{% endhint %} + +Learn more about the best practices in managing credentials and recommended roles in our [Managing Stacks and Components guide](../stack-deployment/README.md). A short code sketch at the end of this page shows what this split looks like from the data scientist's side. + + +## Who is responsible for upgrading the ZenML server? + +The decision to upgrade your ZenML server is usually taken by your Project Owners after consulting with all the teams using the server. This is because there might be teams with conflicting requirements and moving to a new version of ZenML (that might come with upgrades to certain libraries) can break code for some users. + +{% hint style="info" %} +You can choose to have different servers for different teams and that can alleviate some of the pressure to upgrade if you have multiple teams using the same server. ZenML Pro offers [multi-tenancy](../../getting-started/zenml-pro/tenants.md) out of the box, for situations like these. Sign up for a free trial to try it yourself: https://cloud.zenml.io/ +{% endhint %} + +Performing the upgrade itself is a task that typically falls on the MLOps Platform Engineers. They should: + +- ensure that all data is backed up before performing the upgrade +- ensure that no service disruption or downtime happens during the upgrade + +and more. Read in detail about the best practices for upgrading your ZenML server in the [Best Practices for Upgrading ZenML Servers](../manage-the-zenml-server/best-practices-upgrading-zenml.md) guide. + + +## Who is responsible for migrating and maintaining pipelines? + +When you upgrade to a new version of ZenML, you might have to test if your code works as expected and if the syntax is up to date with what ZenML expects. Although we do our best to make new releases compatible with older versions, there might be some breaking changes that you might have to address. + +The pipeline code itself is typically owned by the Data Scientist, but the Platform Engineer is responsible for making sure that new changes can be tested in a safe environment without impacting existing workflows. This involves setting up a new server, doing a staged upgrade, and other strategies. + +When upgrading the code, the Data Scientist should also check the release notes and, where applicable, the migration guide. Read more about the best practices for upgrading your ZenML server and your code in the [Best Practices for Upgrading ZenML Servers](../manage-the-zenml-server/best-practices-upgrading-zenml.md) guide. + + +## Best Practices for Access Management + +Apart from the role-specific tasks we discussed so far, there are some general best practices you should follow: + +- Regular Audits: Conduct periodic reviews of user access and permissions. +- Role-Based Access Control (RBAC): Implement RBAC to streamline permission management. +- Least Privilege: Grant minimal necessary permissions to each role. +- Documentation: Maintain clear documentation of roles, responsibilities, and access policies. + +{% hint style="info" %} +Role-Based Access Control (RBAC) and permission assignment are only available to ZenML Pro users. +{% endhint %} + +By following these guidelines, you can ensure a secure and well-managed ZenML environment that supports collaboration while maintaining proper access controls. 
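As a small illustration of the connector split described above, here is a minimal sketch (assuming connectors have already been registered by a platform engineer) of how a data scientist can discover the available connectors without ever reading their secret values:

```python
from zenml.client import Client

client = Client()

# Listing service connectors only exposes metadata such as their names;
# the credentials stored behind them are never returned to this role.
for connector in client.list_service_connectors().items:
    print(connector.name)
```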
ZenML Scarf
+ + diff --git a/docs/book/how-to/setting-up-a-project-repository/connect-your-git-repository.md b/docs/book/how-to/setting-up-a-project-repository/connect-your-git-repository.md index f0eed7a4951..1c18178fc2d 100644 --- a/docs/book/how-to/setting-up-a-project-repository/connect-your-git-repository.md +++ b/docs/book/how-to/setting-up-a-project-repository/connect-your-git-repository.md @@ -8,6 +8,8 @@ description: >- A code repository in ZenML refers to a remote storage location for your code. Some commonly known code repository platforms include [GitHub](https://github.com/) and [GitLab](https://gitlab.com/). +

A visual representation of how the code repository fits into the general ZenML architecture.

+ + Code repositories enable ZenML to keep track of the code version that you use for your pipeline runs. Additionally, running a pipeline that is tracked in a registered code repository can [speed up the Docker image building for containerized stack components](../customize-docker-builds/use-code-repositories-to-speed-up-docker-build-times.md) by eliminating the need to rebuild Docker images each time you change one of your source code files. Learn more about how code repositories benefit development [here](../customize-docker-builds/use-code-repositories-to-speed-up-docker-build-times.md). diff --git a/docs/book/how-to/setting-up-a-project-repository/create-your-own-template.md b/docs/book/how-to/setting-up-a-project-repository/create-your-own-template.md new file mode 100644 index 00000000000..710249bc21d --- /dev/null +++ b/docs/book/how-to/setting-up-a-project-repository/create-your-own-template.md @@ -0,0 +1,48 @@ +--- +description: How to create your own ZenML template. +--- + +# Create your own ZenML template + +Creating your own ZenML template is a great way to standardize and share your ML workflows across different projects or teams. ZenML uses [Copier](https://copier.readthedocs.io/en/stable/) to manage its project templates. Copier is a library that allows you to generate projects from templates. It's simple, versatile, and powerful. + +Here's a step-by-step guide on how to create your own ZenML template: + +1. **Create a new repository for your template.** This will be the place where you store all the code and configuration files for your template. +2. **Define your ML workflows as ZenML steps and pipelines.** You can start by copying the code from one of the existing ZenML templates (like the [starter template](https://github.com/zenml-io/template-starter)) and modifying it to fit your needs. +3. **Create a `copier.yml` file.** This file is used by Copier to define the template's parameters and their default values. You can learn more about this config file [in the copier docs](https://copier.readthedocs.io/en/stable/creating/); a short illustrative sketch of such a file follows below. +4. **Test your template.** You can use the `copier` command-line tool to generate a new project from your template and check if everything works as expected: + +```bash +copier copy https://github.com/your-username/your-template.git your-project +``` + +Replace `https://github.com/your-username/your-template.git` with the URL of your template repository, and `your-project` with the name of the new project you want to create. + +5. **Use your template with ZenML.** Once your template is ready, you can use it with the `zenml init` command: + +```bash +zenml init --template https://github.com/your-username/your-template.git +``` + +Replace `https://github.com/your-username/your-template.git` with the URL of your template repository. + +If you want to use a specific version of your template, you can use the `--template-tag` option to specify the git tag of the version you want to use: + +```bash +zenml init --template https://github.com/your-username/your-template.git --template-tag v1.0.0 +``` + +Replace `v1.0.0` with the git tag of the version you want to use. + +That's it! Now you have your own ZenML project template that you can use to quickly set up new ML projects. Remember to keep your template up-to-date with the latest best practices and changes in your ML workflows. 
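Step 3 above mentions the `copier.yml` file. As a minimal illustrative sketch (the parameter names here are invented for the example; Copier supports many more options, as described in its docs), such a file might look like this:

```yaml
# copier.yml -- illustrative template parameters; define whatever your template needs.
project_name:
  type: str
  help: Name of the generated project
  default: my_zenml_project

zenml_version:
  type: str
  help: ZenML version to pin in the generated requirements
  default: "0.68.0"
```

When running `copier copy`, the user is prompted for each of these values, and the answers are substituted into the generated project files.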
+ +Our [Production Guide](../../user-guide/production-guide/README.md) documentation is built around the `E2E Batch` project template code. Most examples will be based on it, so we highly recommend you install the `e2e_batch` template with the `--template-with-defaults` flag before diving deeper into this documentation section, so you can follow along in your own local environment. + +```bash +mkdir e2e_batch +cd e2e_batch +zenml init --template e2e_batch --template-with-defaults +``` + +
ZenML Scarf
diff --git a/docs/book/how-to/setting-up-a-project-repository/best-practices.md b/docs/book/how-to/setting-up-a-project-repository/set-up-repository.md similarity index 76% rename from docs/book/how-to/setting-up-a-project-repository/best-practices.md rename to docs/book/how-to/setting-up-a-project-repository/set-up-repository.md index b24308c85e1..2cf4a8b952f 100644 --- a/docs/book/how-to/setting-up-a-project-repository/best-practices.md +++ b/docs/book/how-to/setting-up-a-project-repository/set-up-repository.md @@ -2,7 +2,7 @@ description: Recommended repository structure and best practices. --- -# Best practices +# Set up your repository While it doesn't matter how you structure your ZenML project, here is a recommended project structure the core team often uses: @@ -34,7 +34,20 @@ While it doesn't matter how you structure your ZenML project, here is a recommen └── run.py ``` -All ZenML [Project templates](using-project-templates.md#generating-project-from-a-project-template) are modeled around this basic structure. +All ZenML [Project +templates](using-project-templates.md#generating-project-from-a-project-template) +are modeled around this basic structure. The `steps` and `pipelines` folders +contain the steps and pipelines defined in your project. If your project is +simpler, you can also just keep your steps at the top level of the `steps` folder +without the need to structure them in subfolders. + +{% hint style="info" %} +It might also make sense to register your repository as a code repository. Doing so +enables ZenML to keep track of the code version that you use for your pipeline +runs. Additionally, running a pipeline that is tracked in [a registered code repository](./connect-your-git-repository.md) can speed up the Docker image building for containerized stack +components by eliminating the need to rebuild Docker images each time you change +one of your source code files. Learn more about these in [connecting your Git repository](https://docs.zenml.io/how-to/setting-up-a-project-repository/connect-your-git-repository). +{% endhint %} #### Steps @@ -87,7 +100,9 @@ Collect all your notebooks in one place. By running `zenml init` at the root of your project, you define the project scope for ZenML. In ZenML terms, this will be called your "source's root". This will be used to resolve import paths and store configurations. -Although this is optional, it is recommended that you do this for all of your projects. +Although this is optional, it is recommended that you do this for all of your +projects. This is especially important if you are using Jupyter notebooks in +your project as these require you to have initialized a `.zen` file. {% hint style="warning" %} All of your import paths should be relative to the source's root. diff --git a/docs/book/how-to/setting-up-a-project-repository/shared-components-for-teams.md b/docs/book/how-to/setting-up-a-project-repository/shared-components-for-teams.md new file mode 100644 index 00000000000..d19e1d41c7d --- /dev/null +++ b/docs/book/how-to/setting-up-a-project-repository/shared-components-for-teams.md @@ -0,0 +1,138 @@ +--- +description: Sharing code and libraries within teams. +--- + +# Shared Libraries and Logic for Teams + +Teams often need to collaborate on projects, share versioned logic, and implement cross-cutting functionality that benefits the entire organization. Sharing code libraries allows for incremental improvements, increased robustness, and standardization across projects. 
+ +This guide will cover two main aspects of sharing code within teams using ZenML: + +1. What can be shared +2. How to distribute shared components + +## What Can Be Shared + +ZenML offers several types of custom components that can be shared between teams: + +### Custom Flavors + +Custom flavors are special integrations that don't come built-in with ZenML. These can be implemented and shared as follows: + +1. Create the custom flavor in a shared repository. +2. Implement the custom stack component as described in the [ZenML documentation](../stack-deployment/implement-a-custom-stack-component.md#implementing-a-custom-stack-component-flavor). +3. Register the component using the ZenML CLI, for example in the case of a custom artifact store flavor: + +```bash +# Replace the argument with the import path of your flavor class +zenml artifact-store flavor register <path.to.your.ArtifactStoreFlavor> +``` + +### Custom Steps + +Custom steps can be created and shared via a separate repository. Team members can reference these components as they would normally reference Python modules. + +### Custom Materializers + +Custom materializers are common components that teams often need to share. To implement and share a custom materializer: + +1. Create the materializer in a shared repository. +2. Implement the custom materializer as described in the [ZenML documentation](https://docs.zenml.io/how-to/handle-data-artifacts/handle-custom-data-types). +3. Team members can import and use the shared materializer in their projects (a short sketch follows at the end of this page). + +## How to Distribute Shared Components + +There are several methods to distribute and use shared components within a team: + +### Shared Private Wheels + +Using shared private wheels is an effective approach to sharing code within a team. This method packages Python code for internal distribution without making it publicly available. + +#### Benefits of Using Shared Private Wheels + +- Packaged format: Easy to install using pip +- Version management: Simplifies managing different code versions +- Dependency management: Automatically installs specified dependencies +- Privacy: Can be hosted on internal PyPI servers +- Smooth integration: Imported like any other Python package + +#### Setting Up Shared Private Wheels + +1. Create a private PyPI server or use a service like [AWS CodeArtifact](https://aws.amazon.com/codeartifact/). +2. [Build your code](https://packaging.python.org/en/latest/tutorials/packaging-projects/) [into wheel format](https://opensource.com/article/23/1/packaging-python-modules-wheels). +3. Upload the wheel to your private PyPI server. +4. Configure pip to use the private PyPI server in addition to the public one. +5. Install the private packages using pip, just like public packages. + +### Using Shared Libraries with `DockerSettings` + +When running pipelines with remote orchestrators, ZenML generates a `Dockerfile` at runtime. You can use the `DockerSettings` class to specify how to include your shared libraries in this Docker image. + +#### Installing Shared Libraries + +Here are some ways to include shared libraries using `DockerSettings`. Either specify a list of requirements: + +```python +import os +from zenml.config import DockerSettings +from zenml import pipeline + +docker_settings = DockerSettings( + requirements=["my-simple-package==0.1.0"], + environment={'PIP_EXTRA_INDEX_URL': f"https://{os.environ.get('PYPI_TOKEN', '')}@my-private-pypi-server.com/{os.environ.get('PYPI_USERNAME', '')}/"} +) + +@pipeline(settings={"docker": docker_settings}) +def my_pipeline(...): + ... 
+``` + +Or you can also use a requirements file: + +```python +docker_settings = DockerSettings(requirements="/path/to/requirements.txt") + +@pipeline(settings={"docker": docker_settings}) +def my_pipeline(...): + ... +``` + +The `requirements.txt` file would specify the private index URL in the following +way, for example: + +``` +--extra-index-url https://YOURTOKEN@my-private-pypi-server.com/YOURUSERNAME/ +my-simple-package==0.1.0 +``` + +For information on using private PyPI repositories to share your code, see our [documentation on how to use a private PyPI repository](../customize-docker-builds/how-to-use-a-private-pypi-repository.md). + +## Best Practices + +Regardless of what you're sharing or how you're distributing it, consider these best practices: + +- Use version control for shared code repositories. + +Version control systems like Git allow teams to collaborate on code effectively. They provide a central repository where all team members can access the latest version of the shared components and libraries. + +- Implement proper access controls for private PyPI servers or shared repositories. + +To ensure the security of proprietary code and libraries, it's crucial to set up appropriate access controls. This may involve using authentication mechanisms, managing user permissions, and regularly auditing access logs. + +- Maintain clear documentation for shared components and libraries. + +Comprehensive and up-to-date documentation is essential for the smooth usage and maintenance of shared code. It should cover installation instructions, API references, usage examples, and any specific guidelines or best practices. + +- Regularly update shared libraries and communicate changes to the team. + +As the project evolves, it's important to keep shared libraries updated with the latest bug fixes, performance improvements, and feature enhancements. Establish a process for regularly updating and communicating these changes to the team. + +- Consider setting up continuous integration for shared libraries to ensure quality and compatibility. + +Continuous integration (CI) helps maintain the stability and reliability of shared components. By automatically running tests and checks on each code change, CI can catch potential issues early and ensure compatibility across different environments and dependencies. + +By leveraging these methods for sharing code and libraries, teams can +collaborate more effectively, maintain consistency across projects, and +accelerate development processes within the ZenML framework. + + +
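To make the "Custom Materializers" section above concrete, here is a minimal sketch of a materializer a team might ship in a shared package. The `MyConfig` type and the `data.json` file name are invented for illustration; the structure follows the custom-data-types documentation linked in that section:

```python
import json
import os
from typing import Type

from zenml.enums import ArtifactType
from zenml.io import fileio
from zenml.materializers.base_materializer import BaseMaterializer


class MyConfig(dict):
    """Illustrative shared type that several pipelines pass around."""


class MyConfigMaterializer(BaseMaterializer):
    """Stores a `MyConfig` object as a JSON file in the artifact store."""

    ASSOCIATED_TYPES = (MyConfig,)
    ASSOCIATED_ARTIFACT_TYPE = ArtifactType.DATA

    def load(self, data_type: Type[MyConfig]) -> MyConfig:
        # Read the JSON payload back from the artifact store.
        with fileio.open(os.path.join(self.uri, "data.json"), "r") as f:
            return MyConfig(json.load(f))

    def save(self, data: MyConfig) -> None:
        # Persist the object as JSON inside this artifact's URI.
        with fileio.open(os.path.join(self.uri, "data.json"), "w") as f:
            json.dump(dict(data), f)
```

Once the shared package is installed, importing `MyConfigMaterializer` is typically enough for ZenML to pick it up for `MyConfig` step outputs.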
ZenML Scarf
diff --git a/docs/book/how-to/setting-up-a-project-repository/stacks-pipelines-models.md b/docs/book/how-to/setting-up-a-project-repository/stacks-pipelines-models.md
new file mode 100644
index 00000000000..1c5e278906e
--- /dev/null
+++ b/docs/book/how-to/setting-up-a-project-repository/stacks-pipelines-models.md
@@ -0,0 +1,105 @@
+---
+description: A guide on how to organize stacks, pipelines, models, and artifacts in ZenML.
+---
+
+# Organizing Stacks, Pipelines, Models, and Artifacts
+
+In ZenML, pipelines, stacks, and models form a crucial part of your project's
+architecture, and how you choose to use them dictates how well organized your
+code and workflow are. This section gives you an overview of how to think
+about these concepts and how to best utilize them.
+
+Before we begin, here is a quick overview of the concepts we will be discussing:
+
+- **Stacks**: [Stacks](../../user-guide/production-guide/understand-stacks.md) represent the configuration of tools and infrastructure that your pipelines can run on. A stack is built from multiple stack components like an orchestrator, a container registry, an artifact store, etc. Each of these components deals with one part of your workflow, and they work together to run your pipeline.
+- **Pipelines**: [Pipelines](../../user-guide/starter-guide/create-an-ml-pipeline.md) are a series of steps that each represent a specific task in your ML workflow and are executed in a sequence that ZenML determines from your pipeline definition. Pipelines help you automate many tasks, standardize your executions, and add visibility into what your code is doing.
+- **Models**: [Models](../../how-to/use-the-model-control-plane/README.md) are entities that group pipelines, artifacts, metadata, and other crucial business data together. You may think of a ZenML Model as a "project" or a "workspace" that spans multiple pipelines.
+- **Artifacts**: [Artifacts](../../user-guide/starter-guide/manage-artifacts.md) are the outputs of pipeline steps that you want to track and reuse across multiple pipelines.
+
+Understanding the relationships between stacks, pipelines, models, and artifacts is crucial for effective MLOps with ZenML.
+
+## How many Stacks do I need?
+
+A stack provides the infrastructure and tools for running pipelines. Think of a stack as a representation of the execution environment in which your pipelines run. This comprises both the hardware, like the orchestration environment, and any MLOps tools you use in your workflow. This way, stacks allow you to seamlessly transition between different environments (e.g., local, staging, production) while keeping your pipeline code consistent.
+
+You can learn more about organizing and managing stacks in the [Managing Stacks and Components](../../how-to/stack-deployment/README.md) guide.
+
+You don't need a separate stack for each pipeline; instead, you can run multiple pipelines on the same stack. A stack is meant to be created once and then reused across multiple users and pipelines. This helps in the following ways:
+
+- it reduces the overhead of configuring your infrastructure every time you run a pipeline.
+- it provides a consistent environment for your pipelines to run in, promoting reproducibility.
+- it reduces the risk of errors in the choice of hardware and tool configurations.
+
+## How do I organize my Pipelines, Models, and Artifacts?
+
+Pipelines, Models, and Artifacts form the core of your ML workflow in ZenML.
All of your project logic is organized around these concepts, so it helps to understand how they interact with each other and how to structure your code to make the most of them.
+
+### Pipelines
+
+A pipeline typically encompasses the entire ML workflow, including data
+preparation, model training, and evaluation. It's a good practice to have a
+separate pipeline for different tasks like training and inference. This makes
+your pipelines more modular and easier to manage. Here are some of the benefits:
+
+- Separating pipelines by the nature of the task allows you to [run them independently as needed](../develop-locally/local-prod-pipeline-variants.md). For example, you might train a model in a training pipeline only once a week but run inference on new data every day.
+- It becomes easier to manage and update your code as your project grows more complex.
+- Different people can work on the code for the pipelines without interfering with each other.
+- It helps you organize your runs better.
+
+### Models
+
+Models are what tie related pipelines together. A Model in ZenML is a collection of data artifacts, model artifacts, pipelines, and metadata that can all be tied to a specific project.
+As such, it is good practice to use a Model to move data between pipelines.
+
+Continuing with the example of a training and an inference pipeline, you can use a ZenML Model to hand over the trained model from the training pipeline to the inference pipeline. The Model Control Plane allows you to assign stages to specific model versions, which helps with this.
+
+### Artifacts
+
+Artifacts are the outputs of pipeline steps that you want to track and reuse across multiple pipelines. They can be anything from a dataset to a trained model. It is a good practice to name your artifacts appropriately to make them easy to identify and reuse. Every pipeline run that results in a unique execution of a pipeline step produces a new version of your artifact. This ensures that there's a clear history and traceability of your data and model artifacts.
+
+Artifacts can be tied to a Model for better organization and visibility across pipelines. You can choose to log metadata about your artifacts, which will then show up in the Model Control Plane.
+
+## So how do I put this all together?
+
+![Diagram showing how Models bring together Pipelines and Artifacts](../../.gitbook/assets/model_pipeline_artifact.png)
+
+Let's go through a real-world example to see how we can use Stacks, Pipelines, Models, and Artifacts together. Imagine two people on your team, Bob and Alice, are working on a classification model.
+
+Here's how the workflow would look with ZenML:
+
+- They create three pipelines: one for feature engineering, one for training the model, and one for producing predictions.
+- They set up a [repository for their project](../setting-up-a-project-repository/README.md) and start building their pipelines collaboratively. Let's assume Bob builds the feature engineering and training pipelines and Alice builds the inference pipeline.
+- To test their pipelines locally, they both have a `default` stack with a local orchestrator and a local artifact store. This allows them to quickly iterate on their code without deploying any infrastructure or incurring any costs.
+- While building the inference pipeline, Alice needs to make sure that the preprocessing step in her pipeline is the same as the one used while training.
It might even involve the use of libraries that are not publicly available, so she follows the [Shared Libraries and Logic for Teams](./shared-components-for-teams.md) guide to help with this.
+- Bob's training pipeline produces a model artifact, which Alice's inference pipeline requires as input. It also produces other artifacts, such as metrics and a model checkpoint, that are logged as artifacts in the pipeline run.
+- To allow easy access to model and data artifacts, they [use a ZenML Model](../../how-to/use-the-model-control-plane/associate-a-pipeline-with-a-model.md) which ties the pipelines, models, and artifacts together. Now Alice can just [reference the right model name and find the model artifact she needs.](../../how-to/use-the-model-control-plane/load-artifacts-from-model.md)
+- It is also critical that the right model version from the training pipeline is used in the inference pipeline. The [Model Control Plane](../../how-to/use-the-model-control-plane/README.md) helps Bob keep track of the different versions and easily compare them. Bob can then [promote the best-performing model version to the `production` stage](../../how-to/use-the-model-control-plane/promote-a-model.md), which Alice's pipeline can then consume (see the sketch at the end of this page).
+- Alice's inference pipeline produces a new artifact, in this case a new dataset containing the predictions of the model. Results can also be added as metadata to the model version, allowing easy comparisons.
+
+This is a very simple example, but it shows how you can use ZenML to structure your ML workflow. You can use the same principles for more complex workflows.
+
+## Rules of Thumb
+
+Here are some general guidelines to help you organize your ZenML projects effectively:
+
+### Models
+
+- Create one Model per distinct machine learning use-case or business problem
+- Use Models to group related pipelines, artifacts, and metadata together
+- Leverage the Model Control Plane to manage model versions and stages (e.g., staging, production)
+
+### Stacks
+
+- Maintain separate stacks for different environments (development, staging, production)
+- Share production and staging stacks across teams to ensure consistency
+- Keep local development stacks simple for quick iterations
+
+### Naming and Organization
+
+- Use consistent naming conventions for pipelines, artifacts, and models
+- Leverage tags to organize and filter resources (e.g., `environment:production`, `team:fraud-detection`)
+- Document stack configurations and pipeline dependencies
+- Keep pipeline code modular and reusable across different environments
+
+Following these guidelines will help maintain a clean and scalable MLOps workflow as your project grows.
+
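+To make the Bob-and-Alice handover concrete, here is a minimal sketch of how a
+model version can be promoted and then consumed by another pipeline. The model
+name and the promoted version number are illustrative, and the promotion call
+follows the Model Control Plane documentation linked above, so treat this as an
+outline rather than a complete project.
+
+```python
+from zenml import Model, pipeline
+from zenml.enums import ModelStages
+
+# Bob: attach the training pipeline to a Model so that its runs,
+# artifacts, and metadata are grouped under one entity.
+training_model = Model(name="classification_model", tags=["team:fraud-detection"])
+
+
+@pipeline(model=training_model)
+def training_pipeline():
+    ...
+
+
+# Bob: after comparing versions, promote the best one to production.
+best_version = Model(name="classification_model", version="3")
+best_version.set_stage(stage=ModelStages.PRODUCTION, force=True)
+
+# Alice: consume whatever version is currently in the production stage.
+production_model = Model(name="classification_model", version=ModelStages.PRODUCTION)
+
+
+@pipeline(model=production_model)
+def inference_pipeline():
+    ...
+```
+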
ZenML Scarf
+ + diff --git a/docs/book/how-to/setting-up-a-project-repository/using-project-templates.md b/docs/book/how-to/setting-up-a-project-repository/using-project-templates.md index 7db70c608cd..4b5e7a0e525 100644 --- a/docs/book/how-to/setting-up-a-project-repository/using-project-templates.md +++ b/docs/book/how-to/setting-up-a-project-repository/using-project-templates.md @@ -14,7 +14,7 @@ What would you need to get a quick understanding of the ZenML framework and star Do you have a personal project powered by ZenML that you would like to see here? At ZenML, we are looking for design partnerships and collaboration to help us better understand the real-world scenarios in which MLOps is being used and to build the best possible experience for our users. If you are interested in sharing all or parts of your project with us in the form of a ZenML project template, please [join our Slack](https://zenml.io/slack/) and leave us a message! {% endhint %} -## Generating project from a project template +## Using a project template First, to use the templates, you need to have ZenML and its `templates` extras installed: @@ -22,6 +22,13 @@ First, to use the templates, you need to have ZenML and its `templates` extras i pip install zenml[templates] ``` +{% hint style="warning" %} +Note that these templates are not the same thing as the templates used for +triggering a pipeline (from the dashboard or via the Python SDK). Those are +known as 'Run Templates' and you can read more about them here. +{% endhint %} + Now, you can generate a project from one of the existing templates by using the `--template` flag with the `zenml init` command: ```bash @@ -36,47 +43,5 @@ zenml init --template --template-with-defaults # example: zenml init --template e2e_batch --template-with-defaults ``` -## Creating your own ZenML template - -Creating your own ZenML template is a great way to standardize and share your ML workflows across different projects or teams. ZenML uses [Copier](https://copier.readthedocs.io/en/stable/) to manage its project templates. Copier is a library that allows you to generate projects from templates. It's simple, versatile, and powerful. - -Here's a step-by-step guide on how to create your own ZenML template: - -1. **Create a new repository for your template.** This will be the place where you store all the code and configuration files for your template. -2. **Define your ML workflows as ZenML steps and pipelines.** You can start by copying the code from one of the existing ZenML templates (like the [starter template](https://github.com/zenml-io/template-starter)) and modifying it to fit your needs. -3. **Create a `copier.yml` file.** This file is used by Copier to define the template's parameters and their default values. You can learn more about this config file [in the copier docs](https://copier.readthedocs.io/en/stable/creating/). -4. **Test your template.** You can use the `copier` command-line tool to generate a new project from your template and check if everything works as expected: - -```bash -copier copy https://github.com/your-username/your-template.git your-project -``` - -Replace `https://github.com/your-username/your-template.git` with the URL of your template repository, and `your-project` with the name of the new project you want to create. - -5. 
**Use your template with ZenML.** Once your template is ready, you can use it with the `zenml init` command: - -```bash -zenml init --template https://github.com/your-username/your-template.git -``` - -Replace `https://github.com/your-username/your-template.git` with the URL of your template repository. - -If you want to use a specific version of your template, you can use the `--template-tag` option to specify the git tag of the version you want to use: - -```bash -zenml init --template https://github.com/your-username/your-template.git --template-tag v1.0.0 -``` - -Replace `v1.0.0` with the git tag of the version you want to use. - -That's it! Now you have your own ZenML project template that you can use to quickly set up new ML projects. Remember to keep your template up-to-date with the latest best practices and changes in your ML workflows. - -Our [Production Guide](../../user-guide/production-guide/README.md) documentation is built around the `E2E Batch` project template codes. Most examples will be based on it, so we highly recommend you to install the `e2e_batch` template with `--template-with-defaults` flag before diving deeper into this documentation section, so you can follow this guide along using your own local environment. - -```bash -mkdir e2e_batch -cd e2e_batch -zenml init --template e2e_batch --template-with-defaults -``` - -
ZenML Scarf
+ +
ZenML Scarf
diff --git a/docs/book/how-to/training-with-gpus/accelerate-distributed-training.md b/docs/book/how-to/training-with-gpus/accelerate-distributed-training.md index 3177b5cdc18..4047e781413 100644 --- a/docs/book/how-to/training-with-gpus/accelerate-distributed-training.md +++ b/docs/book/how-to/training-with-gpus/accelerate-distributed-training.md @@ -49,7 +49,7 @@ The `run_with_accelerate` decorator accepts various arguments to configure your 3. If `run_with_accelerate` is misused, it will raise a `RuntimeError` with a helpful message explaining the correct usage. {% hint style="info" %} -To see a full example where Accelerate is used within a ZenML pipeline, check out our llm-lora-finetuning project which leverages the distributed training functionalities while finetuning an LLM. +To see a full example where Accelerate is used within a ZenML pipeline, check out our [llm-lora-finetuning](https://github.com/zenml-io/zenml-projects/blob/main/llm-lora-finetuning/README.md) project which leverages the distributed training functionalities while finetuning an LLM. {% endhint %} ## Ensure your container is Accelerate-ready @@ -111,4 +111,4 @@ If you're new to distributed training or encountering issues, please [connect wi By leveraging the Accelerate integration in ZenML, you can easily scale your training processes and make the most of your available hardware resources, all while maintaining the structure and benefits of your ZenML pipelines. -
ZenML Scarf
\ No newline at end of file +
ZenML Scarf
diff --git a/docs/book/toc.md b/docs/book/toc.md index ae2760196da..5984cc91fc2 100644 --- a/docs/book/toc.md +++ b/docs/book/toc.md @@ -63,10 +63,14 @@ ## How-To -* [😸 Set up a project repository](how-to/setting-up-a-project-repository/README.md) +* [😸 Set up a ZenML project](how-to/setting-up-a-project-repository/README.md) + * [Set up a repository](how-to/setting-up-a-project-repository/set-up-repository.md) * [Connect your git repository](how-to/setting-up-a-project-repository/connect-your-git-repository.md) * [Project templates](how-to/setting-up-a-project-repository/using-project-templates.md) - * [Best practices](how-to/setting-up-a-project-repository/best-practices.md) + * [Create your own template](how-to/setting-up-a-project-repository/create-your-own-template.md) + * [Shared components for teams](how-to/setting-up-a-project-repository/shared-components-for-teams.md) + * [Stacks, pipelines and models](how-to/setting-up-a-project-repository/stacks-pipelines-models.md) + * [Access management](how-to/setting-up-a-project-repository/access-management.md) * [⛓️ Build a pipeline](how-to/build-pipelines/README.md) * [Use pipeline/step parameters](how-to/build-pipelines/use-pipeline-step-parameters.md) * [Configuring a pipeline at runtime](how-to/build-pipelines/configuring-a-pipeline-at-runtime.md) @@ -104,6 +108,7 @@ * [Docker settings on a step](how-to/customize-docker-builds/docker-settings-on-a-step.md) * [Use a prebuilt image for pipeline execution](how-to/customize-docker-builds/use-a-prebuilt-image.md) * [Specify pip dependencies and apt packages](how-to/customize-docker-builds/specify-pip-dependencies-and-apt-packages.md) + * [How to use a private PyPI repository](how-to/customize-docker-builds/how-to-use-a-private-pypi-repository.md) * [Use your own Dockerfiles](how-to/customize-docker-builds/use-your-own-docker-files.md) * [Which files are built into the image](how-to/customize-docker-builds/which-files-are-built-into-the-image.md) * [How to reuse builds](how-to/customize-docker-builds/how-to-reuse-builds.md) diff --git a/docs/book/user-guide/llmops-guide/finetuning-embeddings/finetuning-embeddings.md b/docs/book/user-guide/llmops-guide/finetuning-embeddings/finetuning-embeddings.md index 6e68ec9a387..23167ed3e9d 100644 --- a/docs/book/user-guide/llmops-guide/finetuning-embeddings/finetuning-embeddings.md +++ b/docs/book/user-guide/llmops-guide/finetuning-embeddings/finetuning-embeddings.md @@ -35,7 +35,7 @@ interactive and engaging UI. Both libraries can be used individually but they work better together. We'll showcase their use via ZenML pipelines. To follow along with the example explained in this guide, please follow the -instructions in [the `llm-complete-guide` repository](https://github.com/zenml-io/zenml-projects/llm-complete-guide/README.md) where the full code is also +instructions in [the `llm-complete-guide` repository](https://github.com/zenml-io/zenml-projects/tree/main/llm-complete-guide) where the full code is also available. This specific section on embeddings finetuning can be run locally or using cloud compute as you prefer. diff --git a/docs/book/user-guide/llmops-guide/rag/storing-embeddings-in-a-vector-database.md b/docs/book/user-guide/llmops-guide/rag/storing-embeddings-in-a-vector-database.md index bdeea29f4d5..2b169636080 100644 --- a/docs/book/user-guide/llmops-guide/rag/storing-embeddings-in-a-vector-database.md +++ b/docs/book/user-guide/llmops-guide/rag/storing-embeddings-in-a-vector-database.md @@ -22,7 +22,7 @@ options. 
{% hint style="info" %} For more information on how to set up a PostgreSQL database to follow along with -this guide, please see the instructions in the repository which show how to set +this guide, please [see the instructions in the repository](https://github.com/zenml-io/zenml-projects/tree/main/llm-complete-guide) which show how to set up a PostgreSQL database using Supabase. {% endhint %} diff --git a/docs/mocked_libs.json b/docs/mocked_libs.json index 796dbeea0a1..605258569b9 100644 --- a/docs/mocked_libs.json +++ b/docs/mocked_libs.json @@ -229,10 +229,7 @@ "xgboost", "argilla", "argilla.client", - "argilla.client.client", - "argilla.client.sdk", - "argilla.client.sdk.commons", - "argilla.client.sdk.commons.errors", + "argilla._exceptions._api", "peewee", "prodigy", "prodigy.components", diff --git a/examples/quickstart/configs/training_aws.yaml b/examples/quickstart/configs/training_aws.yaml index 55819ebaf83..3813443a075 100644 --- a/examples/quickstart/configs/training_aws.yaml +++ b/examples/quickstart/configs/training_aws.yaml @@ -3,7 +3,7 @@ enable_cache: True # Environment configuration settings: docker: - parent_image: "715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml-public-pipelines:quickstart-0.67.0-py3.11-aws" + parent_image: "715803424590.dkr.ecr.eu-central-1.amazonaws.com/zenml-public-pipelines:quickstart-0.68.0-py3.11-aws" skip_build: True # If you switch this to False remove the parent_image requirements: requirements.txt orchestrator.sagemaker: diff --git a/examples/quickstart/configs/training_azure.yaml b/examples/quickstart/configs/training_azure.yaml index 52d8d7b1323..035ccb76b6d 100644 --- a/examples/quickstart/configs/training_azure.yaml +++ b/examples/quickstart/configs/training_azure.yaml @@ -3,7 +3,7 @@ enable_cache: True # Environment configuration settings: docker: - parent_image: "zenmldocker/zenml-public-pipelines:quickstart-0.67.0-py3.11-azure" + parent_image: "zenmldocker/zenml-public-pipelines:quickstart-0.68.0-py3.11-azure" skip_build: True requirements: requirements.txt # Uncomment the following lines to specify the accelerator for your azureml orchestrator diff --git a/examples/quickstart/configs/training_gcp.yaml b/examples/quickstart/configs/training_gcp.yaml index 8e712198870..6c6a824be9d 100644 --- a/examples/quickstart/configs/training_gcp.yaml +++ b/examples/quickstart/configs/training_gcp.yaml @@ -3,7 +3,7 @@ enable_cache: True # Environment configuration settings: docker: - parent_image: "zenmldocker/zenml-public-pipelines:quickstart-0.67.0-py3.11-gcp" + parent_image: "zenmldocker/zenml-public-pipelines:quickstart-0.68.0-py3.11-gcp" skip_build: True requirements: requirements.txt # Uncomment the following two lines to specify the accelerator for your vertex orchestrator diff --git a/examples/quickstart/quickstart.ipynb b/examples/quickstart/quickstart.ipynb index 98e1b92c080..44f9ba3fb5d 100644 --- a/examples/quickstart/quickstart.ipynb +++ b/examples/quickstart/quickstart.ipynb @@ -1,756 +1,756 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "63ab391a", - "metadata": {}, - "source": [ - "# ZenML Quickstart: Bridging Local Development and Cloud Deployment\n", - "\n", - "This repository demonstrates how ZenML streamlines the transition of machine learning workflows from local environments to cloud-scale operations.\n", - "\n", - "## Key advantages:\n", - "\n", - "Deploy to major cloud providers with minimal code changes\n", - "\n", - "* Connect directly to your existing infrastructure\n", - "* Bridge the gap between ML 
and Ops teams\n", - "* Gain deep insights into pipeline metadata via the ZenML Dashboard\n", - "\n", - "Unlike traditional MLOps tools, ZenML offers unparalleled flexibility and control. It integrates seamlessly with your infrastructure, allowing both ML and Ops teams to collaborate effectively without compromising on their specific requirements.\n", - "\n", - "The notebook guides you through adapting local code for cloud deployment, showcasing ZenML's ability to enhance workflow efficiency while maintaining reproducibility and auditability in production.\n", - "\n", - "Ready to unify your ML development and operations? Let's begin. The diagram below \n", - "describes what we'll show you in this example.\n", - "\n", - "\"Pipelines\n", - "\n", - "1) We have done some of the experimenting for you already and created a simple finetuning pipeline for a text-to-text task.\n", - "\n", - "2) We will run this pipeline on your machine and a verify that everything works as expected.\n", - "\n", - "3) Now we'll connect ZenML to your infrastructure and configure everything.\n", - "\n", - "4) Finally, we are ready to run our code remotely.\n", - "\n", - "Follow along this notebook to understand how you can use ZenML to productionalize your ML workflows!" - ] - }, - { - "cell_type": "markdown", - "id": "8f466b16", - "metadata": {}, - "source": [ - "## Run on Colab\n", - "\n", - "You can use Google Colab to run this notebook, no local installation\n", - "required!\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n", - "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/quickstart.ipynb)" - ] - }, - { - "cell_type": "markdown", - "id": "66b2977c", - "metadata": {}, - "source": [ - "# 👶 Step 0. Install Requirements\n", - "\n", - "Let's install ZenML and all requirement to get started." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4385bdb-6cc8-4a6b-8de2-a7fd658556aa", - "metadata": {}, - "outputs": [], - "source": [ - "# Choose a cloud provider - at the end of this notebook you will run a pipeline on this cloud provider\n", - "CLOUD_PROVIDER = None # Set this to \"GCP\", \"AWS\" or \"AZURE\" as needed\n", - "\n", - "\n", - "def in_google_colab() -> bool:\n", - " \"\"\"Checks wether this notebook is run in google colab.\"\"\"\n", - " try:\n", - " import google.colab # noqa\n", - "\n", - " return True\n", - "\n", - " except ModuleNotFoundError:\n", - " return False\n", - "\n", - "\n", - "if in_google_colab():\n", - " # Pull required modules from this example\n", - " !git clone -b main https://github.com/zenml-io/zenml\n", - " !cp -r zenml/examples/quickstart/* .\n", - " !rm -rf zenml\n", - "\n", - "\n", - "# Common imports and setup\n", - "if CLOUD_PROVIDER.lower() == \"gcp\":\n", - " !pip install -r requirements_gcp.txt\n", - "\n", - "elif CLOUD_PROVIDER.lower() == \"aws\":\n", - " !pip install -r requirements_aws.txt\n", - "\n", - "elif CLOUD_PROVIDER.lower() == \"azure\":\n", - " !pip install -r requirements_azure.txt\n", - "\n", - "else: # In this case the second half of the notebook won't work for you\n", - " !pip install -r requirements.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f76f562e", - "metadata": {}, - "outputs": [], - "source": [ - "# Restart Kernel to ensure all libraries are properly loaded\n", - "import IPython\n", - "\n", - "IPython.Application.instance().kernel.do_shutdown(restart=True)" - ] - }, - { - "cell_type": "markdown", - "id": "3b044374", - "metadata": {}, - "source": [ - "\n", - "Please wait for the installation to complete before running subsequent cells. At\n", - "the end of the installation, the notebook kernel will restart." - ] - }, - { - "cell_type": "markdown", - "id": "966ce581", - "metadata": {}, - "source": [ - "## ☁️ Step 1: Connect to your ZenML Server\n", - "To run this quickstart you need to connect to a ZenML Server. You can deploy it [yourself on your own infrastructure](https://docs.zenml.io/getting-started/deploying-zenml) or try it out for free, no credit-card required in our [ZenML Pro managed service](https://zenml.io/pro)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2587315", - "metadata": {}, - "outputs": [], - "source": [ - "zenml_server_url = (\n", - " None # INSERT URL TO SERVER HERE in the form \"https://URL_TO_SERVER\"\n", - ")\n", - "\n", - "assert zenml_server_url\n", - "\n", - "!zenml connect --url $zenml_server_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f78a2f42-7a53-45f1-b45b-77bfc3762260", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize ZenML and define the root for imports and docker builds\n", - "!zenml init\n", - "\n", - "!zenml stack set default" - ] - }, - { - "cell_type": "markdown", - "id": "35e48460", - "metadata": {}, - "source": [ - "## 🥇 Step 2: Build and run your first pipeline\n", - "\n", - "In this quickstart we'll be working with a small dataset of sentences in old english paired with more modern formulations. The task is a text-to-text transformation.\n", - "\n", - "When you're getting started with a machine learning problem you'll want to break down your code into distinct functions that load your data, bring it into the correct shape and finally produce a model. 
This is the experimentation phase where we try to massage our data into the right format and feed it into our model training.\n", - "\n", - "\"Experimentation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cd974d1", - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from datasets import Dataset\n", - "from typing_extensions import Annotated\n", - "\n", - "from zenml import step\n", - "\n", - "PROMPT = \"\" # In case you want to also use a prompt you can set it here\n", - "\n", - "\n", - "def read_data_from_url(url):\n", - " \"\"\"Reads data from url.\n", - "\n", - " Assumes the individual data points are linebreak separated\n", - " and input, targets are separated by a `|` pipe.\n", - " \"\"\"\n", - " inputs = []\n", - " targets = []\n", - "\n", - " response = requests.get(url)\n", - " response.raise_for_status() # Raise an exception for bad responses\n", - "\n", - " for line in response.text.splitlines():\n", - " old, modern = line.strip().split(\"|\")\n", - " inputs.append(f\"{PROMPT}{old}\")\n", - " targets.append(modern)\n", - "\n", - " return {\"input\": inputs, \"target\": targets}\n", - "\n", - "\n", - "@step\n", - "def load_data(\n", - " data_url: str,\n", - ") -> Annotated[Dataset, \"full_dataset\"]:\n", - " \"\"\"Load and prepare the dataset.\"\"\"\n", - "\n", - " # Fetch and process the data\n", - " data = read_data_from_url(data_url)\n", - "\n", - " # Convert to Dataset\n", - " return Dataset.from_dict(data)" - ] - }, - { - "cell_type": "markdown", - "id": "b6286b67", - "metadata": {}, - "source": [ - "ZenML is built in a way that allows you to experiment with your data and build\n", - "your pipelines one step at a time. If you want to call this function to see how it\n", - "works, you can just call it directly. Here we take a look at the first few rows\n", - "of your training dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d838e2ea", - "metadata": {}, - "outputs": [], - "source": [ - "data_source = \"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\"\n", - "\n", - "dataset = load_data(data_url=data_source)\n", - "print(f\"Input: {dataset['input'][1]} - Target: {dataset['target'][1]}\")" - ] - }, - { - "cell_type": "markdown", - "id": "28c05291", - "metadata": {}, - "source": [ - "Everything looks as we'd expect and the input/output pair looks to be in the right format 🥳.\n", - "\n", - "For the sake of this quickstart we have prepared a few steps in the steps-directory. We'll now connect these together into a pipeline. To do this simply plug multiple steps together through their inputs and outputs. Then just add the `@pipeline` decorator to the function that connects the steps." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b50a9537", - "metadata": {}, - "outputs": [], - "source": [ - "import materializers\n", - "from steps import (\n", - " evaluate_model,\n", - " load_data,\n", - " split_dataset,\n", - " test_model,\n", - " tokenize_data,\n", - " train_model,\n", - ")\n", - "from steps.model_trainer import T5_Model\n", - "\n", - "from zenml import Model, pipeline\n", - "from zenml.client import Client\n", - "\n", - "assert materializers\n", - "\n", - "# Initialize the ZenML client to fetch objects from the ZenML Server\n", - "client = Client()\n", - "\n", - "Client().activate_stack(\n", - " \"default\"\n", - ") # We will start by using the default stack which is local\n", - "\n", - "model_name = \"YeOldeEnglishTranslator\"\n", - "model = Model(\n", - " name=\"YeOldeEnglishTranslator\",\n", - " description=\"Model to translate from old to modern english\",\n", - " tags=[\"quickstart\", \"llm\", \"t5\"],\n", - ")\n", - "\n", - "\n", - "@pipeline(model=model)\n", - "def english_translation_pipeline(\n", - " data_url: str,\n", - " model_type: T5_Model,\n", - " per_device_train_batch_size: int,\n", - " gradient_accumulation_steps: int,\n", - " dataloader_num_workers: int,\n", - " num_train_epochs: int = 5,\n", - "):\n", - " \"\"\"Define a pipeline that connects the steps.\"\"\"\n", - " full_dataset = load_data(data_url)\n", - " tokenized_dataset, tokenizer = tokenize_data(\n", - " dataset=full_dataset, model_type=model_type\n", - " )\n", - " tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset = (\n", - " split_dataset(\n", - " tokenized_dataset,\n", - " train_size=0.7,\n", - " test_size=0.1,\n", - " eval_size=0.2,\n", - " subset_size=0.1, # We use a subset of the dataset to speed things up\n", - " random_state=42,\n", - " )\n", - " )\n", - " model = train_model(\n", - " tokenized_dataset=tokenized_train_dataset,\n", - " model_type=model_type,\n", - " num_train_epochs=num_train_epochs,\n", - " per_device_train_batch_size=per_device_train_batch_size,\n", - " gradient_accumulation_steps=gradient_accumulation_steps,\n", - " dataloader_num_workers=dataloader_num_workers,\n", - " )\n", - " evaluate_model(model=model, tokenized_dataset=tokenized_eval_dataset)\n", - " test_model(\n", - " model=model,\n", - " tokenized_test_dataset=tokenized_test_dataset,\n", - " tokenizer=tokenizer,\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "7cd73c23", - "metadata": {}, - "source": [ - "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n", - "pipeline function itself:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e0aa9af", - "metadata": {}, - "outputs": [], - "source": [ - "# Run the pipeline and configure some parameters at runtime\n", - "pipeline_run = english_translation_pipeline(\n", - " data_url=\"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\",\n", - " model_type=\"t5-small\",\n", - " num_train_epochs=1, # to make this demo fast, we start at 1 epoch\n", - " per_device_train_batch_size=2,\n", - " gradient_accumulation_steps=4,\n", - " dataloader_num_workers=4,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "6c42078a", - "metadata": {}, - "source": [ - "As you can see the pipeline has run successfully. Here is a sneak-peak of the dashboard view into this pipeline. 
The URL for this view can be found in the logs.\n", - "\n", - "\"Dashboard\n", - "\n", - "This isn't all that the ZenML Dashboard has to offer, if you navigate over to the ZenML Model control plane, you'll also find the produced model along with a lot of important metadata.\n", - "\n", - "\"Model\n", - "\n", - "Here you'll also see a collection of example Input-Output pairs. As you can see, the model is currently not performing its task well." - ] - }, - { - "cell_type": "markdown", - "id": "a037f09d", - "metadata": {}, - "source": [ - "We can now access the trained model and it's tokenizer from the ZenML Model Control Plane. This will allow us to interact with the model directly." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "53e514ac-1a0a-49a0-b8a4-e33cee12c765", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "\n", - "# load the model object\n", - "model = client.get_model_version(model_name).get_model_artifact(\"model\").load()\n", - "tokenizer = (\n", - " client.get_model_version(model_name).get_artifact(\"tokenizer\").load()\n", - ")\n", - "\n", - "test_text = \"I do desire we may be better strangers\" # Insert your own test sentence here\n", - "\n", - "input_ids = tokenizer(\n", - " test_text,\n", - " return_tensors=\"pt\",\n", - " max_length=128,\n", - " truncation=True,\n", - " padding=\"max_length\",\n", - ").input_ids\n", - "\n", - "with torch.no_grad():\n", - " outputs = model.generate(\n", - " input_ids,\n", - " max_length=128,\n", - " num_return_sequences=1,\n", - " no_repeat_ngram_size=2,\n", - " top_k=50,\n", - " top_p=0.95,\n", - " temperature=0.7,\n", - " )\n", - "\n", - "decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", - "\n", - "print(decoded_output)" - ] - }, - { - "cell_type": "markdown", - "id": "1e653c7a-4073-424e-8a59-c69f49526b96", - "metadata": {}, - "source": [ - "## Lets recap what we've done so far\n", - "\n", - "We created a modular pipeline, this pipeline is modularly constructed from different steps. We have shown that this pipeline runs locally.\n", - "\n", - "As expected, the modcel does not yet solve its task. To train a model that can solve our task well, we would have to train a larger model for longer. For this, we'll need to move away from our local environment. " - ] - }, - { - "cell_type": "markdown", - "id": "8c28b474", - "metadata": {}, - "source": [ - "# ⌚ Step 3: Scale it up in the cloud" - ] - }, - { - "cell_type": "markdown", - "id": "a791b32b-f6be-4ae2-867c-5e628f363858", - "metadata": {}, - "source": [ - "Our last section confirmed to us, that the pipeline works. Let's now run the pipeline in the environment of your choice.\n", - "\n", - "For you to be able to try this step, you will need to have access to a cloud environment (AWS, GCP, AZURE). ZenML wraps around all the major cloud providers and orchestration tools and lets you easily deploy your code onto them.\n", - "\n", - "To do this lets head over to the `Stack` section of your ZenML Dashboard. Here you'll be able to either connect to an existing or deploy a new environment. Choose on of the options presented to you there and come back when you have a stack ready to go. \n", - "\n", - "\"Stack\n", - "\n", - "Then proceed to the section below. Also be sure that you are running with a remote ZenML server (see Step 1 above)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e7a90b5-78c3-4019-8a81-671b5d62d470", - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.environment import Environment\n", - "\n", - "# Set the cloud provider here\n", - "CLOUD_PROVIDER = None # Set this to \"GCP\", \"AWS\" or \"AZURE\"\n", - "assert CLOUD_PROVIDER\n", - "\n", - "# Set the name of the stack that you created within zenml\n", - "stack_name = None # Set this\n", - "assert stack_name # Set your stack, follow instruction above\n", - "\n", - "from zenml import pipeline\n", - "from zenml.client import Client\n", - "from zenml.config import DockerSettings\n", - "\n", - "settings = {}\n", - "\n", - "# Common imports and setup\n", - "if CLOUD_PROVIDER.lower() == \"gcp\":\n", - " parent_image = (\n", - " \"zenmldocker/zenml-public-pipelines:quickstart-0.67.0-py3.11-gcp\"\n", - " )\n", - " skip_build = True\n", - "\n", - "elif CLOUD_PROVIDER.lower() == \"aws\":\n", - " from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import (\n", - " SagemakerOrchestratorSettings,\n", - " )\n", - "\n", - " parent_image = \"339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml-public-pipelines:quickstart-0.67.0-py3.11-aws\"\n", - " skip_build = True # if you switch this to False, you need to remove the parent image\n", - "\n", - " settings[\"orchestrator.sagemaker\"] = SagemakerOrchestratorSettings(\n", - " instance_type=\"ml.m5.4xlarge\"\n", - " )\n", - "\n", - "elif CLOUD_PROVIDER.lower() == \"azure\":\n", - " parent_image = (\n", - " \"zenmldocker/zenml-public-pipelines:quickstart-0.67.0-py3.11-azure\"\n", - " )\n", - " skip_build = True\n", - "\n", - "Client().activate_stack(stack_name)\n", - "\n", - "data_source = \"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\"\n", - "\n", - "# We've prebuilt a docker image for this quickstart to speed things up, feel free to delete the DockerSettings to build from scratch\n", - "settings[\"docker\"] = DockerSettings(\n", - " parent_image=parent_image, skip_build=skip_build\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "8a5cd031-0661-4073-a5ea-6aac5f989212", - "metadata": {}, - "source": [ - "If you are in a google colab you might need to rerun the cell above a second time after the runtime restarted." - ] - }, - { - "cell_type": "markdown", - "id": "95f17b7a-5a82-4975-b9bd-6a63fbb97a68", - "metadata": {}, - "source": [ - "## 🚀 Ready to launch" - ] - }, - { - "cell_type": "markdown", - "id": "df14f30c-9a8e-46ca-ba44-cf16ea715dac", - "metadata": {}, - "source": [ - "We now have configured a ZenML stack that represents your very own cloud infrastructure. For the next pipeline run, we'll be training the same t5 model (`t5_small`) on your own infrastrucutre.\n", - "\n", - "Note: The whole process may take a bit longer the first time around, as your pipeline code needs to be built into docker containers to be run in the orchestration environment of your stack. Any consecutive run of the pipeline, even with different parameters set, will not take as long again thanks to docker caching." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfad9bd5", - "metadata": {}, - "outputs": [], - "source": [ - "# In the case that we are within a colab environment we want to remove\n", - "# these folders\n", - "if Environment.in_google_colab():\n", - " !rm -rf results\n", - " !rm -rf sample_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "12e758fe-6ea3-42ff-bea8-33953135bf6b", - "metadata": {}, - "outputs": [], - "source": [ - "from pipelines import (\n", - " english_translation_pipeline,\n", - ")\n", - "\n", - "from zenml import Model\n", - "\n", - "model_name = \"YeOldeEnglishTranslator\"\n", - "model = Model(\n", - " name=\"YeOldeEnglishTranslator\",\n", - ")\n", - "\n", - "pipeline_run = english_translation_pipeline.with_options(\n", - " settings=settings, model=model\n", - ")(\n", - " data_url=\"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\",\n", - " model_type=\"t5-small\",\n", - " num_train_epochs=2,\n", - " per_device_train_batch_size=4,\n", - " gradient_accumulation_steps=4,\n", - " dataloader_num_workers=0, # Some cloud environment don't support multiple of these\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "c7eef2c6-6dfb-4b67-9883-594a0df20173", - "metadata": {}, - "source": [ - "You did it! You build a pipeline locally, verified that all its parts work well together and now are running it on a production environment\n", - "\n", - "\"Pipeline\n", - "\n", - "Depending on the backend you chose, you can also go inspect your run in the orchestrator of your choice. Here is an example on GCP Vertex:\n", - "\n", - "\"Pipeline" - ] - }, - { - "cell_type": "markdown", - "id": "1a03054e-8b3e-4edb-9d87-82ae51693d2d", - "metadata": {}, - "source": [ - "## Adding Accelerators\n", - "Each of the cloud providers allows users to add accelerators to their serverless offerings. Here's what you need to add to the pipeline settings in order to unlock gpus. Keep in mind, that you might have to increase your quotas within the cloud providers." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2f7c3c5-dcde-4824-a012-f3da224cb8d8", - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.config import ResourceSettings\n", - "\n", - "if CLOUD_PROVIDER == \"GCP\":\n", - " from zenml.integrations.gcp.flavors.vertex_orchestrator_flavor import (\n", - " VertexOrchestratorSettings,\n", - " )\n", - "\n", - " # find out about your options here: https://docs.zenml.io/stack-components/orchestrators/vertex#additional-configuration\n", - "\n", - " english_translation_pipeline.with_options(\n", - " settings={\n", - " \"orchestrator.vertex\": VertexOrchestratorSettings(\n", - " node_selector_constraint=(\n", - " \"cloud.google.com/gke-accelerator\",\n", - " \"NVIDIA_TESLA_P4\",\n", - " )\n", - " ),\n", - " \"resources\": ResourceSettings(memory=\"32GB\", gpu_count=1),\n", - " }\n", - " )\n", - "if CLOUD_PROVIDER == \"AWS\":\n", - " from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import (\n", - " SagemakerOrchestratorSettings,\n", - " )\n", - "\n", - " # find out your options here: https://docs.zenml.io/stack-components/orchestrators/sagemaker#configuration-at-pipeline-or-step-level\n", - "\n", - " english_translation_pipeline.with_options(\n", - " settings={\n", - " \"orchestrator.sagemaker\": SagemakerOrchestratorSettings(\n", - " instance_type=\"ml.p2.xlarge\"\n", - " )\n", - " }\n", - " )\n", - "if CLOUD_PROVIDER == \"AZURE\":\n", - " from zenml.integrations.azure.flavors import AzureMLOrchestratorSettings\n", - "\n", - " # find out your options here: https://docs.zenml.io/stack-components/orchestrators/azureml#settings\n", - " # The quickest way is probably to configure a compute-instance in azure ml. This instance should contain\n", - " # a gpu. Then specify the name of the compute instance here.\n", - "\n", - " compute_name = None # Insert the name of your compute instance here\n", - "\n", - " english_translation_pipeline.with_options(\n", - " settings={\n", - " \"orchestrator.azureml\": AzureMLOrchestratorSettings(\n", - " mode=\"compute-instance\", compute_name=compute_name\n", - " )\n", - " }\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "8231677c-1fd6-4ec3-8c8c-47fd9406072e", - "metadata": {}, - "source": [ - "## Now it's up to you" - ] - }, - { - "cell_type": "markdown", - "id": "90c31e0d-dfab-4692-a406-0dc439f25443", - "metadata": {}, - "source": [ - "You can now start worrying about making the model actually work well on our toy example or any other dataset you like.\n", - "\n", - "Here are some things that you could do:\n", - "\n", - "* Iterate on the training data and its tokenization\n", - "* You can switch out the model itself. Instead of `model_type=\"t5_small\"` you could use `model_type=\"t5_large\"` for example\n", - "* You can train for longer by increasing the `num_train_epochs=xxx`. In order to speed this up you can also add accelerators to your orchestrators. Learn more about this in the section below.\n", - "\n", - "No matter what avenue you choose to actually make the model work, we would love to see how you did it, so please reach out and share your solution with us either on [**Slack Community**](https://zenml.io/slack) or through our email hello@zenml.io." 
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679",
-   "metadata": {},
-   "source": [
-    "## Further exploration\n",
-    "\n",
-    "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n",
-    "about the capabilities of ZenML. For example, you might want to:\n",
-    "\n",
-    "- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n",
-    "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).\n",
-    "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n",
-    "\n",
-    "## What next?\n",
-    "\n",
-    "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n",
-    "* If you want to quickly get started with ZenML, check out [ZenML Pro](https://zenml.io/pro)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c560354d-9e78-4061-aaff-2e6213229911",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "63ab391a",
+   "metadata": {},
+   "source": [
+    "# ZenML Quickstart: Bridging Local Development and Cloud Deployment\n",
+    "\n",
+    "This repository demonstrates how ZenML streamlines the transition of machine learning workflows from local environments to cloud-scale operations.\n",
+    "\n",
+    "## Key advantages:\n",
+    "\n",
+    "* Deploy to major cloud providers with minimal code changes\n",
+    "* Connect directly to your existing infrastructure\n",
+    "* Bridge the gap between ML and Ops teams\n",
+    "* Gain deep insights into pipeline metadata via the ZenML Dashboard\n",
+    "\n",
+    "Unlike traditional MLOps tools, ZenML offers unparalleled flexibility and control. It integrates seamlessly with your infrastructure, allowing both ML and Ops teams to collaborate effectively without compromising on their specific requirements.\n",
+    "\n",
+    "The notebook guides you through adapting local code for cloud deployment, showcasing ZenML's ability to enhance workflow efficiency while maintaining reproducibility and auditability in production.\n",
+    "\n",
+    "Ready to unify your ML development and operations? Let's begin. The diagram below\n",
+    "describes what we'll show you in this example.\n",
+    "\n",
+    "\"Pipelines\n",
+    "\n",
+    "1) We have done some of the experimenting for you already and created a simple finetuning pipeline for a text-to-text task.\n",
+    "\n",
+    "2) We will run this pipeline on your machine and verify that everything works as expected.\n",
+    "\n",
+    "3) Now we'll connect ZenML to your infrastructure and configure everything.\n",
+    "\n",
+    "4) Finally, we are ready to run our code remotely.\n",
+    "\n",
+    "Follow along with this notebook to understand how you can use ZenML to productionalize your ML workflows!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f466b16",
+   "metadata": {},
+   "source": [
+    "## Run on Colab\n",
+    "\n",
+    "You can use Google Colab to run this notebook, no local installation\n",
+    "required!\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](\n",
+    "https://colab.research.google.com/github/zenml-io/zenml/blob/main/examples/quickstart/quickstart.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "66b2977c",
+   "metadata": {},
+   "source": [
+    "# 👶 Step 0. Install Requirements\n",
+    "\n",
+    "Let's install ZenML and all requirements to get started."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4385bdb-6cc8-4a6b-8de2-a7fd658556aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Choose a cloud provider - at the end of this notebook you will run a pipeline on this cloud provider\n",
+    "CLOUD_PROVIDER = None  # Set this to \"GCP\", \"AWS\" or \"AZURE\" as needed\n",
+    "\n",
+    "\n",
+    "def in_google_colab() -> bool:\n",
+    "    \"\"\"Checks whether this notebook is run in Google Colab.\"\"\"\n",
+    "    try:\n",
+    "        import google.colab  # noqa\n",
+    "\n",
+    "        return True\n",
+    "\n",
+    "    except ModuleNotFoundError:\n",
+    "        return False\n",
+    "\n",
+    "\n",
+    "if in_google_colab():\n",
+    "    # Pull required modules from this example\n",
+    "    !git clone -b main https://github.com/zenml-io/zenml\n",
+    "    !cp -r zenml/examples/quickstart/* .\n",
+    "    !rm -rf zenml\n",
+    "\n",
+    "\n",
+    "# Install the requirements for the chosen cloud provider\n",
+    "if CLOUD_PROVIDER is None:\n",
+    "    # In this case the second half of the notebook won't work for you\n",
+    "    !pip install -r requirements.txt\n",
+    "\n",
+    "elif CLOUD_PROVIDER.lower() == \"gcp\":\n",
+    "    !pip install -r requirements_gcp.txt\n",
+    "\n",
+    "elif CLOUD_PROVIDER.lower() == \"aws\":\n",
+    "    !pip install -r requirements_aws.txt\n",
+    "\n",
+    "elif CLOUD_PROVIDER.lower() == \"azure\":\n",
+    "    !pip install -r requirements_azure.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f76f562e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Restart Kernel to ensure all libraries are properly loaded\n",
+    "import IPython\n",
+    "\n",
+    "IPython.Application.instance().kernel.do_shutdown(restart=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b044374",
+   "metadata": {},
+   "source": [
+    "\n",
+    "Please wait for the installation to complete before running subsequent cells. At\n",
+    "the end of the installation, the notebook kernel will restart."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "966ce581",
+   "metadata": {},
+   "source": [
+    "## ☁️ Step 1: Connect to your ZenML Server\n",
+    "To run this quickstart you need to connect to a ZenML Server. You can deploy it [yourself on your own infrastructure](https://docs.zenml.io/getting-started/deploying-zenml) or try it out for free, with no credit card required, in our [ZenML Pro managed service](https://zenml.io/pro)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2587315",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zenml_server_url = (\n",
+    "    None  # INSERT URL TO SERVER HERE in the form \"https://URL_TO_SERVER\"\n",
+    ")\n",
+    "\n",
+    "assert zenml_server_url\n",
+    "\n",
+    "!zenml connect --url $zenml_server_url"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f78a2f42-7a53-45f1-b45b-77bfc3762260",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize ZenML and define the root for imports and docker builds\n",
+    "!zenml init\n",
+    "\n",
+    "!zenml stack set default"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35e48460",
+   "metadata": {},
+   "source": [
+    "## 🥇 Step 2: Build and run your first pipeline\n",
+    "\n",
+    "In this quickstart we'll be working with a small dataset of sentences in Old English paired with more modern formulations. The task is a text-to-text transformation.\n",
+    "\n",
+    "When you're getting started with a machine learning problem you'll want to break down your code into distinct functions that load your data, bring it into the correct shape, and finally produce a model. This is the experimentation phase, where we try to massage our data into the right format and feed it into our model training.\n",
+    "\n",
+    "\"Experimentation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3cd974d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from datasets import Dataset\n",
+    "from typing_extensions import Annotated\n",
+    "\n",
+    "from zenml import step\n",
+    "\n",
+    "PROMPT = \"\"  # In case you want to also use a prompt you can set it here\n",
+    "\n",
+    "\n",
+    "def read_data_from_url(url):\n",
+    "    \"\"\"Reads data from a URL.\n",
+    "\n",
+    "    Assumes the individual data points are linebreak separated\n",
+    "    and input, targets are separated by a `|` pipe.\n",
+    "    \"\"\"\n",
+    "    inputs = []\n",
+    "    targets = []\n",
+    "\n",
+    "    response = requests.get(url)\n",
+    "    response.raise_for_status()  # Raise an exception for bad responses\n",
+    "\n",
+    "    for line in response.text.splitlines():\n",
+    "        old, modern = line.strip().split(\"|\")\n",
+    "        inputs.append(f\"{PROMPT}{old}\")\n",
+    "        targets.append(modern)\n",
+    "\n",
+    "    return {\"input\": inputs, \"target\": targets}\n",
+    "\n",
+    "\n",
+    "@step\n",
+    "def load_data(\n",
+    "    data_url: str,\n",
+    ") -> Annotated[Dataset, \"full_dataset\"]:\n",
+    "    \"\"\"Load and prepare the dataset.\"\"\"\n",
+    "\n",
+    "    # Fetch and process the data\n",
+    "    data = read_data_from_url(data_url)\n",
+    "\n",
+    "    # Convert to Dataset\n",
+    "    return Dataset.from_dict(data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b6286b67",
+   "metadata": {},
+   "source": [
+    "ZenML is built in a way that allows you to experiment with your data and build\n",
+    "your pipelines one step at a time. If you want to call this function to see how it\n",
+    "works, you can just call it directly. Here we take a look at the first few rows\n",
+    "of your training dataset."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d838e2ea", + "metadata": {}, + "outputs": [], + "source": [ + "data_source = \"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\"\n", + "\n", + "dataset = load_data(data_url=data_source)\n", + "print(f\"Input: {dataset['input'][1]} - Target: {dataset['target'][1]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "28c05291", + "metadata": {}, + "source": [ + "Everything looks as we'd expect and the input/output pair looks to be in the right format 🥳.\n", + "\n", + "For the sake of this quickstart we have prepared a few steps in the steps-directory. We'll now connect these together into a pipeline. To do this simply plug multiple steps together through their inputs and outputs. Then just add the `@pipeline` decorator to the function that connects the steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b50a9537", + "metadata": {}, + "outputs": [], + "source": [ + "import materializers\n", + "from steps import (\n", + " evaluate_model,\n", + " load_data,\n", + " split_dataset,\n", + " test_model,\n", + " tokenize_data,\n", + " train_model,\n", + ")\n", + "from steps.model_trainer import T5_Model\n", + "\n", + "from zenml import Model, pipeline\n", + "from zenml.client import Client\n", + "\n", + "assert materializers\n", + "\n", + "# Initialize the ZenML client to fetch objects from the ZenML Server\n", + "client = Client()\n", + "\n", + "Client().activate_stack(\n", + " \"default\"\n", + ") # We will start by using the default stack which is local\n", + "\n", + "model_name = \"YeOldeEnglishTranslator\"\n", + "model = Model(\n", + " name=\"YeOldeEnglishTranslator\",\n", + " description=\"Model to translate from old to modern english\",\n", + " tags=[\"quickstart\", \"llm\", \"t5\"],\n", + ")\n", + "\n", + "\n", + "@pipeline(model=model)\n", + "def english_translation_pipeline(\n", + " data_url: str,\n", + " model_type: T5_Model,\n", + " per_device_train_batch_size: int,\n", + " gradient_accumulation_steps: int,\n", + " dataloader_num_workers: int,\n", + " num_train_epochs: int = 5,\n", + "):\n", + " \"\"\"Define a pipeline that connects the steps.\"\"\"\n", + " full_dataset = load_data(data_url)\n", + " tokenized_dataset, tokenizer = tokenize_data(\n", + " dataset=full_dataset, model_type=model_type\n", + " )\n", + " tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset = (\n", + " split_dataset(\n", + " tokenized_dataset,\n", + " train_size=0.7,\n", + " test_size=0.1,\n", + " eval_size=0.2,\n", + " subset_size=0.1, # We use a subset of the dataset to speed things up\n", + " random_state=42,\n", + " )\n", + " )\n", + " model = train_model(\n", + " tokenized_dataset=tokenized_train_dataset,\n", + " model_type=model_type,\n", + " num_train_epochs=num_train_epochs,\n", + " per_device_train_batch_size=per_device_train_batch_size,\n", + " gradient_accumulation_steps=gradient_accumulation_steps,\n", + " dataloader_num_workers=dataloader_num_workers,\n", + " )\n", + " evaluate_model(model=model, tokenized_dataset=tokenized_eval_dataset)\n", + " test_model(\n", + " model=model,\n", + " tokenized_test_dataset=tokenized_test_dataset,\n", + " tokenizer=tokenizer,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "7cd73c23", + "metadata": {}, + "source": [ + "We're ready to run the pipeline now, which we can do just as with the step - by calling the\n", + "pipeline function itself:" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "1e0aa9af", + "metadata": {}, + "outputs": [], + "source": [ + "# Run the pipeline and configure some parameters at runtime\n", + "pipeline_run = english_translation_pipeline(\n", + " data_url=\"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\",\n", + " model_type=\"t5-small\",\n", + " num_train_epochs=1, # to make this demo fast, we start at 1 epoch\n", + " per_device_train_batch_size=2,\n", + " gradient_accumulation_steps=4,\n", + " dataloader_num_workers=4,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6c42078a", + "metadata": {}, + "source": [ + "As you can see the pipeline has run successfully. Here is a sneak-peak of the dashboard view into this pipeline. The URL for this view can be found in the logs.\n", + "\n", + "\"Dashboard\n", + "\n", + "This isn't all that the ZenML Dashboard has to offer, if you navigate over to the ZenML Model control plane, you'll also find the produced model along with a lot of important metadata.\n", + "\n", + "\"Model\n", + "\n", + "Here you'll also see a collection of example Input-Output pairs. As you can see, the model is currently not performing its task well." + ] + }, + { + "cell_type": "markdown", + "id": "a037f09d", + "metadata": {}, + "source": [ + "We can now access the trained model and it's tokenizer from the ZenML Model Control Plane. This will allow us to interact with the model directly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53e514ac-1a0a-49a0-b8a4-e33cee12c765", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "# load the model object\n", + "model = client.get_model_version(model_name).get_model_artifact(\"model\").load()\n", + "tokenizer = (\n", + " client.get_model_version(model_name).get_artifact(\"tokenizer\").load()\n", + ")\n", + "\n", + "test_text = \"I do desire we may be better strangers\" # Insert your own test sentence here\n", + "\n", + "input_ids = tokenizer(\n", + " test_text,\n", + " return_tensors=\"pt\",\n", + " max_length=128,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + ").input_ids\n", + "\n", + "with torch.no_grad():\n", + " outputs = model.generate(\n", + " input_ids,\n", + " max_length=128,\n", + " num_return_sequences=1,\n", + " no_repeat_ngram_size=2,\n", + " top_k=50,\n", + " top_p=0.95,\n", + " temperature=0.7,\n", + " )\n", + "\n", + "decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)\n", + "\n", + "print(decoded_output)" + ] + }, + { + "cell_type": "markdown", + "id": "1e653c7a-4073-424e-8a59-c69f49526b96", + "metadata": {}, + "source": [ + "## Lets recap what we've done so far\n", + "\n", + "We created a modular pipeline, this pipeline is modularly constructed from different steps. We have shown that this pipeline runs locally.\n", + "\n", + "As expected, the modcel does not yet solve its task. To train a model that can solve our task well, we would have to train a larger model for longer. For this, we'll need to move away from our local environment. " + ] + }, + { + "cell_type": "markdown", + "id": "8c28b474", + "metadata": {}, + "source": [ + "# ⌚ Step 3: Scale it up in the cloud" + ] + }, + { + "cell_type": "markdown", + "id": "a791b32b-f6be-4ae2-867c-5e628f363858", + "metadata": {}, + "source": [ + "Our last section confirmed to us, that the pipeline works. Let's now run the pipeline in the environment of your choice.\n", + "\n", + "For you to be able to try this step, you will need to have access to a cloud environment (AWS, GCP, AZURE). 
ZenML wraps around all the major cloud providers and orchestration tools and lets you easily deploy your code onto them.\n", + "\n", + "To do this, let's head over to the `Stack` section of your ZenML Dashboard. Here you'll be able to either connect to an existing environment or deploy a new one. Choose one of the options presented to you there and come back when you have a stack ready to go.\n", + "\n", + "(Image: Stack)\n", + "\n", + "Then proceed to the section below. Also be sure that you are running with a remote ZenML server (see Step 1 above)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e7a90b5-78c3-4019-8a81-671b5d62d470", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.environment import Environment\n", + "\n", + "# Set the cloud provider here\n", + "CLOUD_PROVIDER = None # Set this to \"GCP\", \"AWS\" or \"AZURE\"\n", + "assert CLOUD_PROVIDER\n", + "\n", + "# Set the name of the stack that you created within zenml\n", + "stack_name = None # Set this\n", + "assert stack_name # Set your stack, follow the instructions above\n", + "\n", + "from zenml import pipeline\n", + "from zenml.client import Client\n", + "from zenml.config import DockerSettings\n", + "\n", + "settings = {}\n", + "\n", + "# Common imports and setup\n", + "if CLOUD_PROVIDER.lower() == \"gcp\":\n", + " parent_image = (\n", + " \"zenmldocker/zenml-public-pipelines:quickstart-0.68.0-py3.11-gcp\"\n", + " )\n", + " skip_build = True\n", + "\n", + "elif CLOUD_PROVIDER.lower() == \"aws\":\n", + " from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import (\n", + " SagemakerOrchestratorSettings,\n", + " )\n", + "\n", + " parent_image = \"339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml-public-pipelines:quickstart-0.68.0-py3.11-aws\"\n", + " skip_build = True # if you switch this to False, you need to remove the parent image\n", + "\n", + " settings[\"orchestrator.sagemaker\"] = SagemakerOrchestratorSettings(\n", + " instance_type=\"ml.m5.4xlarge\"\n", + " )\n", + "\n", + "elif CLOUD_PROVIDER.lower() == \"azure\":\n", + " parent_image = (\n", + " \"zenmldocker/zenml-public-pipelines:quickstart-0.68.0-py3.11-azure\"\n", + " )\n", + " skip_build = True\n", + "\n", + "Client().activate_stack(stack_name)\n", + "\n", + "data_source = \"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\"\n", + "\n", + "# We've prebuilt a Docker image for this quickstart to speed things up; feel free to delete the DockerSettings to build from scratch\n", + "settings[\"docker\"] = DockerSettings(\n", + " parent_image=parent_image, skip_build=skip_build\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8a5cd031-0661-4073-a5ea-6aac5f989212", + "metadata": {}, + "source": [ + "If you are in a Google Colab, you might need to rerun the cell above a second time after the runtime restarts." + ] + }, + { + "cell_type": "markdown", + "id": "95f17b7a-5a82-4975-b9bd-6a63fbb97a68", + "metadata": {}, + "source": [ + "## 🚀 Ready to launch" + ] + }, + { + "cell_type": "markdown", + "id": "df14f30c-9a8e-46ca-ba44-cf16ea715dac", + "metadata": {}, + "source": [ + "We have now configured a ZenML stack that represents your very own cloud infrastructure. For the next pipeline run, we'll be training the same T5 model (`t5-small`) on your own infrastructure.\n", + "\n", + "Note: The whole process may take a bit longer the first time around, as your pipeline code needs to be built into Docker containers to be run in the orchestration environment of your stack.
Any subsequent run of the pipeline, even with different parameters set, will not take as long, thanks to Docker caching." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfad9bd5", + "metadata": {}, + "outputs": [], + "source": [ + "# If we are within a Colab environment, we want to remove\n", + "# these folders\n", + "if Environment.in_google_colab():\n", + " !rm -rf results\n", + " !rm -rf sample_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12e758fe-6ea3-42ff-bea8-33953135bf6b", + "metadata": {}, + "outputs": [], + "source": [ + "from pipelines import (\n", + " english_translation_pipeline,\n", + ")\n", + "\n", + "from zenml import Model\n", + "\n", + "model_name = \"YeOldeEnglishTranslator\"\n", + "model = Model(\n", + " name=\"YeOldeEnglishTranslator\",\n", + ")\n", + "\n", + "pipeline_run = english_translation_pipeline.with_options(\n", + " settings=settings, model=model\n", + ")(\n", + " data_url=\"https://storage.googleapis.com/zenml-public-bucket/quickstart-files/translations.txt\",\n", + " model_type=\"t5-small\",\n", + " num_train_epochs=2,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " dataloader_num_workers=0, # Some cloud environments don't support multiple dataloader workers\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c7eef2c6-6dfb-4b67-9883-594a0df20173", + "metadata": {}, + "source": [ + "You did it! You built a pipeline locally, verified that all its parts work well together, and are now running it in a production environment.\n", + "\n", + "(Image: Pipeline)\n", + "\n", + "Depending on the backend you chose, you can also go inspect your run in the orchestrator of your choice. Here is an example on GCP Vertex:\n", + "\n", + "(Image: Pipeline)" + ] + }, + { + "cell_type": "markdown", + "id": "1a03054e-8b3e-4edb-9d87-82ae51693d2d", + "metadata": {}, + "source": [ + "## Adding Accelerators\n", + "Each of the cloud providers allows users to add accelerators to their serverless offerings. Here's what you need to add to the pipeline settings in order to unlock GPUs. Keep in mind that you might have to increase your quotas within the cloud providers."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2f7c3c5-dcde-4824-a012-f3da224cb8d8", + "metadata": {}, + "outputs": [], + "source": [ + "from zenml.config import ResourceSettings\n", + "\n", + "if CLOUD_PROVIDER == \"GCP\":\n", + " from zenml.integrations.gcp.flavors.vertex_orchestrator_flavor import (\n", + " VertexOrchestratorSettings,\n", + " )\n", + "\n", + " # find out about your options here: https://docs.zenml.io/stack-components/orchestrators/vertex#additional-configuration\n", + "\n", + " english_translation_pipeline.with_options(\n", + " settings={\n", + " \"orchestrator.vertex\": VertexOrchestratorSettings(\n", + " node_selector_constraint=(\n", + " \"cloud.google.com/gke-accelerator\",\n", + " \"NVIDIA_TESLA_P4\",\n", + " )\n", + " ),\n", + " \"resources\": ResourceSettings(memory=\"32GB\", gpu_count=1),\n", + " }\n", + " )\n", + "if CLOUD_PROVIDER == \"AWS\":\n", + " from zenml.integrations.aws.flavors.sagemaker_orchestrator_flavor import (\n", + " SagemakerOrchestratorSettings,\n", + " )\n", + "\n", + " # find out your options here: https://docs.zenml.io/stack-components/orchestrators/sagemaker#configuration-at-pipeline-or-step-level\n", + "\n", + " english_translation_pipeline.with_options(\n", + " settings={\n", + " \"orchestrator.sagemaker\": SagemakerOrchestratorSettings(\n", + " instance_type=\"ml.p2.xlarge\"\n", + " )\n", + " }\n", + " )\n", + "if CLOUD_PROVIDER == \"AZURE\":\n", + " from zenml.integrations.azure.flavors import AzureMLOrchestratorSettings\n", + "\n", + " # find out your options here: https://docs.zenml.io/stack-components/orchestrators/azureml#settings\n", + " # The quickest way is probably to configure a compute instance in Azure ML. This instance should contain\n", + " # a GPU. Then specify the name of the compute instance here.\n", + "\n", + " compute_name = None # Insert the name of your compute instance here\n", + "\n", + " english_translation_pipeline.with_options(\n", + " settings={\n", + " \"orchestrator.azureml\": AzureMLOrchestratorSettings(\n", + " mode=\"compute-instance\", compute_name=compute_name\n", + " )\n", + " }\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "8231677c-1fd6-4ec3-8c8c-47fd9406072e", + "metadata": {}, + "source": [ + "## Now it's up to you" + ] + }, + { + "cell_type": "markdown", + "id": "90c31e0d-dfab-4692-a406-0dc439f25443", + "metadata": {}, + "source": [ + "You can now start worrying about making the model actually work well on our toy example or any other dataset you like.\n", + "\n", + "Here are some things that you could do:\n", + "\n", + "* Iterate on the training data and its tokenization\n", + "* You can switch out the model itself. Instead of `model_type=\"t5-small\"` you could use `model_type=\"t5-large\"`, for example\n", + "* You can train for longer by increasing `num_train_epochs`. To speed this up, you can also add accelerators to your orchestrators, as shown in the section above.\n", + "\n", + "No matter what avenue you choose to actually make the model work, we would love to see how you did it, so please reach out and share your solution with us either in our [**Slack Community**](https://zenml.io/slack) or via email at hello@zenml.io.",
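+ "\n", + "If you want to jump straight to a bigger run, the next cell is a minimal sketch of what that could look like. Treat it as an illustration only: the `t5-large` checkpoint and the parameter values are assumptions to adapt to your setup (and your quotas), not tested settings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7e5f3a1", + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative sketch only: re-run the same pipeline with a larger model\n", + "# and longer training. `t5-large` and the values below are assumptions.\n", + "pipeline_run = english_translation_pipeline.with_options(\n", + " settings=settings, model=model\n", + ")(\n", + " data_url=data_source,\n", + " model_type=\"t5-large\",\n", + " num_train_epochs=5,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " dataloader_num_workers=0,\n", + ")"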
+ ] + }, + { + "cell_type": "markdown", + "id": "594ee4fc-f102-4b99-bdc3-2f1670c87679", + "metadata": {}, + "source": [ + "## Further exploration\n", + "\n", + "This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more\n", + "about the capabilities of ZenML. For example, you might want to:\n", + "\n", + "- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/connect-deployed-zenml) to collaborate with your colleagues.\n", + "- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).\n", + "- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).\n", + "\n", + "## What next?\n", + "\n", + "* If you have questions or feedback... join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!\n", + "* If you want to quickly get started with ZenML, check out [ZenML Pro](https://zenml.io/pro)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c560354d-9e78-4061-aaff-2e6213229911", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 } diff --git a/examples/quickstart/requirements.txt b/examples/quickstart/requirements.txt index 6f083e05b40..d97f9f7c4a0 100644 --- a/examples/quickstart/requirements.txt +++ b/examples/quickstart/requirements.txt @@ -1,4 +1,4 @@ -zenml[server]==0.67.0 +zenml[server]==0.68.0 notebook pyarrow datasets diff --git a/examples/quickstart/requirements_aws.txt b/examples/quickstart/requirements_aws.txt index 6cd1fc7628c..52ecae6394c 100644 --- a/examples/quickstart/requirements_aws.txt +++ b/examples/quickstart/requirements_aws.txt @@ -1,4 +1,4 @@ -zenml[server]==0.67.0 +zenml[server]==0.68.0 notebook pyarrow datasets diff --git a/examples/quickstart/requirements_azure.txt b/examples/quickstart/requirements_azure.txt index eb7631c0385..2adf7b540eb 100644 --- a/examples/quickstart/requirements_azure.txt +++ b/examples/quickstart/requirements_azure.txt @@ -1,4 +1,4 @@ -zenml[server]==0.67.0 +zenml[server]==0.68.0 notebook pyarrow datasets diff --git a/examples/quickstart/requirements_gcp.txt b/examples/quickstart/requirements_gcp.txt index 6c07e4612c6..0d2e290de88 100644 --- a/examples/quickstart/requirements_gcp.txt +++ b/examples/quickstart/requirements_gcp.txt @@ -1,4 +1,4 @@ -zenml[server]==0.67.0 +zenml[server]==0.68.0 notebook pyarrow datasets diff --git a/pyproject.toml b/pyproject.toml index c569b4c6ae0..69cc144000d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "zenml" -version = "0.67.0" +version = "0.68.0" packages = [{ include = "zenml", from = "src" }] description = "ZenML: Write production-ready ML code." 
authors = ["ZenML GmbH "] diff --git a/scripts/install-zenml-dev.sh b/scripts/install-zenml-dev.sh index c3ca2f61f57..7d0e9521c93 100755 --- a/scripts/install-zenml-dev.sh +++ b/scripts/install-zenml-dev.sh @@ -36,7 +36,7 @@ install_integrations() { # figure out the python version python_version=$(python -c "import sys; print('.'.join(map(str, sys.version_info[:2])))") - ignore_integrations="feast label_studio bentoml seldon pycaret skypilot_aws skypilot_gcp skypilot_azure pigeon prodigy" + ignore_integrations="feast label_studio bentoml seldon pycaret skypilot_aws skypilot_gcp skypilot_azure pigeon prodigy argilla" # Ignore tensorflow and deepchecks only on Python 3.12 if [ "$python_version" = "3.12" ]; then diff --git a/scripts/lint.sh b/scripts/lint.sh index 26bbcdaa3ed..914190eba5c 100755 --- a/scripts/lint.sh +++ b/scripts/lint.sh @@ -30,7 +30,7 @@ fi # checks for yaml formatting errors if [ "$SKIP_YAMLFIX" = false ]; then - yamlfix --check .github tests --exclude "dependabot.yml" + yamlfix --check .github tests -e "dependabot.yml" -e "workflows/release_prepare.yml" -e "workflows/release_finalize.yml" fi # autoflake replacement: checks for unused imports and variables diff --git a/scripts/test-migrations.sh b/scripts/test-migrations.sh index 444550b221d..f18ef4675de 100755 --- a/scripts/test-migrations.sh +++ b/scripts/test-migrations.sh @@ -23,12 +23,21 @@ else fi # List of versions to test -VERSIONS=("0.40.3" "0.43.0" "0.44.3" "0.45.6" "0.47.0" "0.50.0" "0.51.0" "0.52.0" "0.53.1" "0.54.1" "0.55.5" "0.56.4" "0.57.1" "0.60.0" "0.61.0" "0.62.0" "0.63.0" "0.64.0" "0.65.0") +VERSIONS=("0.40.3" "0.43.0" "0.44.3" "0.45.6" "0.47.0" "0.50.0" "0.51.0" "0.52.0" "0.53.1" "0.54.1" "0.55.5" "0.56.4" "0.57.1" "0.60.0" "0.61.0" "0.62.0" "0.63.0" "0.64.0" "0.65.0" "0.68.0") -# Include latest release dynamically, if not there already -CURRENT_VERSION=$(cat src/zenml/VERSION) -if [[ ! " ${VERSIONS[@]} " =~ " ${CURRENT_VERSION} " ]]; then - VERSIONS+=("${CURRENT_VERSION}") +# Try to get the latest version using pip index +version=$(pip index versions zenml 2>/dev/null | grep -v YANKED | head -n1 | awk '{print $2}' | tr -d '()') + +# Verify we got a version +if [ -z "$version" ]; then + echo "Error: Could not find the latest version for zenml" >&2 + return 1 +fi + +LATEST_VERSION=$(echo $version | xargs) + +if [[ ! 
" ${VERSIONS[@]} " =~ " ${LATEST_VERSION} " ]]; then + VERSIONS+=("${LATEST_VERSION}") fi # Function to compare semantic versions diff --git a/src/zenml/VERSION b/src/zenml/VERSION index 24c41e038fb..c657195a953 100644 --- a/src/zenml/VERSION +++ b/src/zenml/VERSION @@ -1 +1 @@ -0.67.0 \ No newline at end of file +0.68.0 \ No newline at end of file diff --git a/src/zenml/integrations/argilla/__init__.py b/src/zenml/integrations/argilla/__init__.py index 3953cf3863b..9d87666f673 100644 --- a/src/zenml/integrations/argilla/__init__.py +++ b/src/zenml/integrations/argilla/__init__.py @@ -26,7 +26,7 @@ class ArgillaIntegration(Integration): NAME = ARGILLA REQUIREMENTS = [ - "argilla>=1.20.0,<2", + "argilla>=2.0.0", ] @classmethod diff --git a/src/zenml/integrations/argilla/annotators/argilla_annotator.py b/src/zenml/integrations/argilla/annotators/argilla_annotator.py index 09e0790c56b..fe04ce40dc5 100644 --- a/src/zenml/integrations/argilla/annotators/argilla_annotator.py +++ b/src/zenml/integrations/argilla/annotators/argilla_annotator.py @@ -14,11 +14,12 @@ """Implementation of the Argilla annotation integration.""" import json -from typing import Any, List, Tuple, Type, cast +import webbrowser +from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast import argilla as rg -from argilla.client.client import Argilla as ArgillaClient -from argilla.client.sdk.commons.errors import BaseClientError, NotFoundApiError +from argilla._exceptions._api import ArgillaAPIError +from argilla.client import Argilla as ArgillaClient from zenml.annotators.base_annotator import BaseAnnotator from zenml.integrations.argilla.flavors import ( @@ -67,7 +68,7 @@ def get_url(self) -> str: ) def _get_client(self) -> ArgillaClient: - """Gets Argilla client. + """Gets the Argilla client. Returns: Argilla client. @@ -75,7 +76,7 @@ def _get_client(self) -> ArgillaClient: config = self.config init_kwargs = {"api_url": self.get_url()} - # set the API key from the secret or using settings + # Set the API key from the secret or using settings authentication_secret = self.get_authentication_secret() if config.api_key and authentication_secret: api_key = config.api_key @@ -92,194 +93,333 @@ def _get_client(self) -> ArgillaClient: if api_key: init_kwargs["api_key"] = api_key - if config.workspace is not None: - init_kwargs["workspace"] = config.workspace - if config.extra_headers is not None: - init_kwargs["extra_headers"] = json.loads(config.extra_headers) + if config.headers is not None: + init_kwargs["headers"] = json.loads(config.headers) if config.httpx_extra_kwargs is not None: init_kwargs["httpx_extra_kwargs"] = json.loads( config.httpx_extra_kwargs ) try: - _ = rg.active_client() - except BaseClientError: - rg.init(**init_kwargs) - return rg.active_client() + _ = rg.Argilla(**init_kwargs).me + except ArgillaAPIError as e: + logger.error(f"Failed to verify the Argilla instance: {str(e)}") + return rg.Argilla(**init_kwargs) - def get_url_for_dataset(self, dataset_name: str) -> str: + def get_url_for_dataset(self, dataset_name: str, **kwargs: Any) -> str: """Gets the URL of the annotation interface for the given dataset. Args: dataset_name: The name of the dataset. + **kwargs: Additional keyword arguments to pass to the Argilla client. + -workspace: The name of the workspace. By default, the first available. Returns: - The URL of the annotation interface. + The URL of of the dataset annotation interface. 
""" - dataset_id = self.get_dataset(dataset_name=dataset_name).id + workspace = kwargs.get("workspace") + + dataset_id = self.get_dataset( + dataset_name=dataset_name, workspace=workspace + ).id return f"{self.get_url()}/dataset/{dataset_id}/annotation-mode" - def get_datasets(self) -> List[Any]: + def get_datasets(self, **kwargs: Any) -> List[Any]: """Gets the datasets currently available for annotation. + Args: + **kwargs: Additional keyword arguments to pass to the Argilla client. + -workspace: The name of the workspace. By default, the first available. + If set, only the datasets in the workspace will be returned. + Returns: A list of datasets. """ - old_datasets = self._get_client().list_datasets() - new_datasets = rg.FeedbackDataset.list() + workspace = kwargs.get("workspace") + + if workspace is None: + datasets = list(self._get_client().datasets) + else: + datasets = list(self._get_client().workspaces(workspace).datasets) + + return datasets + + def get_dataset_names(self, **kwargs: Any) -> List[str]: + """Gets the names of the datasets. + + Args: + **kwargs: Additional keyword arguments to pass to the Argilla client. + -workspace: The name of the workspace. By default, the first available. + If set, only the dataset names in the workspace will be returned. + + Returns: + A list of dataset names. + """ + workspace = kwargs.get("workspace") + + if workspace is None: + dataset_names = [dataset.name for dataset in self.get_datasets()] + else: + dataset_names = [ + dataset.name + for dataset in self.get_datasets(workspace=workspace) + ] + + return dataset_names + + def _get_data_by_status( + self, dataset_name: str, status: str, workspace: Optional[str] + ) -> Any: + """Gets the dataset containing the data with the specified status. + + Args: + dataset_name: The name of the dataset. + status: The response status to filter by ('completed' for labeled, + 'pending' for unlabeled). + workspace: The name of the workspace. By default, the first available. + + Returns: + The list of records with the specified status. + """ + dataset = self.get_dataset( + dataset_name=dataset_name, workspace=workspace + ) - # Deduplicate datasets based on their names - dataset_names = set() - deduplicated_datasets = [] - for dataset in new_datasets + old_datasets: - if dataset.name not in dataset_names: - dataset_names.add(dataset.name) - deduplicated_datasets.append(dataset) + query = rg.Query(filter=rg.Filter([("status", "==", status)])) - return deduplicated_datasets + return dataset.records( + query=query, + with_suggestions=True, + with_vectors=True, + with_responses=True, + ).to_list() - def get_dataset_stats(self, dataset_name: str) -> Tuple[int, int]: + def get_dataset_stats( + self, dataset_name: str, **kwargs: Any + ) -> Tuple[int, int]: """Gets the statistics of the given dataset. Args: dataset_name: The name of the dataset. + **kwargs: Additional keyword arguments to pass to the Argilla client. + -workspace: The name of the workspace. By default, the first available. Returns: A tuple containing (labeled_task_count, unlabeled_task_count) for the dataset. 
""" - dataset = self.get_dataset(dataset_name=dataset_name) + workspace = kwargs.get("workspace") + labeled_task_count = len( - dataset.filter_by(response_status="submitted") + self._get_data_by_status( + dataset_name=dataset_name, + status="completed", + workspace=workspace, + ) ) unlabeled_task_count = len( - dataset.filter_by(response_status="pending") + self._get_data_by_status( + dataset_name=dataset_name, + status="pending", + workspace=workspace, + ) ) + return (labeled_task_count, unlabeled_task_count) - def add_dataset(self, **kwargs: Any) -> Any: - """Registers a dataset for annotation. + def launch(self, **kwargs: Any) -> None: + """Launches the annotation interface. + + Args: + **kwargs: Additional keyword arguments to pass to the Argilla client. + """ + url = kwargs.get("api_url") or self.get_url() - You must pass a `dataset_name` and a `dataset` object to this method. + if self._get_client(): + webbrowser.open(url, new=1, autoraise=True) + else: + logger.warning( + "Could not launch annotation interface" + "because the connection could not be established." + ) + + def add_dataset(self, **kwargs: Any) -> Any: + """Create a dataset for annotation. Args: - **kwargs: Additional keyword arguments to pass to the Argilla - client. + **kwargs: Additional keyword arguments to pass to the Argilla client. + -dataset_name: The name of the dataset. + -settings: The settings for the dataset. + -workspace: The name of the workspace. By default, the first available. Returns: An Argilla dataset object. Raises: - ValueError: if 'dataset_name' and 'dataset' aren't provided. + ValueError: if `dataset_name` or `settings` aren't provided. + RuntimeError: if the workspace creation fails. + RuntimeError: if the dataset creation fails. """ dataset_name = kwargs.get("dataset_name") - dataset = kwargs.get("dataset") + settings = kwargs.get("settings") + workspace = kwargs.get("workspace") - if not dataset_name: - raise ValueError("`dataset_name` keyword argument is required.") - elif dataset is None: - raise ValueError("`dataset` keyword argument is required.") + if dataset_name is None or settings is None: + raise ValueError( + "`dataset_name` and `settings` keyword arguments are required." + ) + + if workspace is None and not self._get_client().workspaces: + workspace_to_create = rg.Workspace(name="argilla") + try: + workspace = workspace_to_create.create() + except Exception as e: + raise RuntimeError( + "Failed to create the `argilla` workspace." + ) from e try: - logger.info(f"Pushing dataset '{dataset_name}' to Argilla...") - dataset.push_to_argilla(name=dataset_name) - logger.info(f"Dataset '{dataset_name}' pushed successfully.") + dataset = rg.Dataset( + name=dataset_name, workspace=workspace, settings=settings + ) + logger.info(f"Creating the dataset '{dataset_name}' in Argilla...") + dataset.create() + logger.info(f"Dataset '{dataset_name}' successfully created.") + return self.get_dataset( + dataset_name=dataset_name, workspace=workspace + ) except Exception as e: logger.error( - f"Failed to push dataset '{dataset_name}' to Argilla: {str(e)}" + f"Failed to create dataset '{dataset_name}' in Argilla: {str(e)}" ) - raise ValueError( - f"Failed to push dataset to Argilla: {str(e)}" + raise RuntimeError( + f"Failed to create the dataset '{dataset_name}' in Argilla: {str(e)}" ) from e - return self.get_dataset(dataset_name=dataset_name) - def delete_dataset(self, **kwargs: Any) -> None: - """Deletes a dataset from the annotation interface. 
+ def add_records( + self, + dataset_name: str, + records: Union[Any, List[Dict[str, Any]]], + workspace: Optional[str] = None, + mapping: Optional[Dict[str, str]] = None, + ) -> Any: + """Adds records to an Argilla dataset for annotation. Args: - **kwargs: Additional keyword arguments to pass to the Argilla - client. + dataset_name: The name of the dataset. + records: The records to add to the dataset. + workspace: The name of the workspace. By default, the first available. + mapping: The mapping of the records to the dataset fields. By default, None. Raises: - ValueError: If the dataset name is not provided. + RuntimeError: If the records cannot be loaded to Argilla. """ - dataset_name = kwargs.get("dataset_name") - if not dataset_name: - raise ValueError("`dataset_name` keyword argument is required.") + dataset = self.get_dataset( + dataset_name=dataset_name, workspace=workspace + ) try: - self._get_client().delete(name=dataset_name) - self.get_dataset(dataset_name=dataset_name).delete() - logger.info(f"Dataset '{dataset_name}' deleted successfully.") - except ValueError: - logger.warning( - f"Dataset '{dataset_name}' not found. Skipping deletion." + logger.info( + f"Loading the records to '{dataset_name}' in Argilla..." ) + dataset.records.log(records=records, mapping=mapping) + logger.info( + f"Records loaded successfully to Argilla for '{dataset_name}'." + ) + except Exception as e: + logger.error( + f"Failed to load the records to Argilla for '{dataset_name}': {str(e)}" + ) + raise RuntimeError( + f"Failed to load the records to Argilla: {str(e)}" + ) from e def get_dataset(self, **kwargs: Any) -> Any: """Gets the dataset with the given name. Args: **kwargs: Additional keyword arguments to pass to the Argilla client. + -dataset_name: The name of the dataset. + -workspace: The name of the workspace. By default, the first available. Returns: - The Argilla DatasetModel object for the given name. + The Argilla Dataset for the given name and workspace, if specified. Raises: ValueError: If the dataset name is not provided or if the dataset does not exist. """ dataset_name = kwargs.get("dataset_name") + workspace = kwargs.get("workspace") + if not dataset_name: raise ValueError("`dataset_name` keyword argument is required.") try: - if rg.FeedbackDataset.from_argilla(name=dataset_name) is not None: - return rg.FeedbackDataset.from_argilla(name=dataset_name) + dataset = self._get_client().datasets( + name=dataset_name, workspace=workspace + ) + if dataset is None: + logger.error(f"Dataset '{dataset_name}' not found.") else: - return self._get_client().get_dataset(name=dataset_name) - except (NotFoundApiError, ValueError) as e: + return dataset + except ValueError as e: logger.error(f"Dataset '{dataset_name}' not found.") raise ValueError(f"Dataset '{dataset_name}' not found.") from e - def get_data_by_status(self, dataset_name: str, status: str) -> Any: - """Gets the dataset containing the data with the specified status. + def delete_dataset(self, **kwargs: Any) -> None: + """Deletes a dataset from the annotation interface. Args: - dataset_name: The name of the dataset. - status: The response status to filter by ('submitted' for labeled, - 'pending' for unlabeled). - - Returns: - The dataset containing the data with the specified status. + **kwargs: Additional keyword arguments to pass to the Argilla client. + -dataset_name: The name of the dataset. + -workspace: The name of the workspace. By default, the first available. Raises: - ValueError: If the dataset name is not provided.
+ ValueError: If the dataset name is not provided or if the dataset + is not found. """ + dataset_name = kwargs.get("dataset_name") + workspace = kwargs.get("workspace") + if not dataset_name: - raise ValueError("`dataset_name` argument is required.") + raise ValueError("`dataset_name` keyword argument is required.") - return self.get_dataset(dataset_name=dataset_name).filter_by( - response_status=status - ) + try: + dataset = self.get_dataset( + dataset_name=dataset_name, workspace=workspace + ) + dataset.delete() + logger.info(f"Dataset '{dataset_name}' deleted successfully.") + except ValueError: + logger.warning( + f"Dataset '{dataset_name}' not found. Skipping deletion." + ) def get_labeled_data(self, **kwargs: Any) -> Any: """Gets the dataset containing the labeled data. Args: **kwargs: Additional keyword arguments to pass to the Argilla client. + -dataset_name: The name of the dataset. + -workspace: The name of the workspace. By default, the first available. Returns: - The dataset containing the labeled data. + The list of annotated records. Raises: ValueError: If the dataset name is not provided. """ - if dataset_name := kwargs.get("dataset_name"): - return self.get_data_by_status(dataset_name, status="submitted") - else: + dataset_name = kwargs.get("dataset_name") + workspace = kwargs.get("workspace") + + if not dataset_name: raise ValueError("`dataset_name` keyword argument is required.") + return self._get_data_by_status( + dataset_name, workspace=workspace, status="completed" + ) + def get_unlabeled_data(self, **kwargs: str) -> Any: """Gets the dataset containing the unlabeled data. @@ -287,12 +427,17 @@ def get_unlabeled_data(self, **kwargs: str) -> Any: **kwargs: Additional keyword arguments to pass to the Argilla client. Returns: - The dataset containing the unlabeled data. + The list of pending records for annotation. Raises: ValueError: If the dataset name is not provided. """ - if dataset_name := kwargs.get("dataset_name"): - return self.get_data_by_status(dataset_name, status="pending") - else: + dataset_name = kwargs.get("dataset_name") + workspace = kwargs.get("workspace") + + if not dataset_name: raise ValueError("`dataset_name` keyword argument is required.") + + return self._get_data_by_status( + dataset_name, workspace=workspace, status="pending" + ) diff --git a/src/zenml/integrations/argilla/flavors/argilla_annotator_flavor.py b/src/zenml/integrations/argilla/flavors/argilla_annotator_flavor.py index f14d5a86c4d..649c9eb4cd7 100644 --- a/src/zenml/integrations/argilla/flavors/argilla_annotator_flavor.py +++ b/src/zenml/integrations/argilla/flavors/argilla_annotator_flavor.py @@ -24,6 +24,7 @@ from zenml.config.base_settings import BaseSettings from zenml.integrations.argilla import ARGILLA_ANNOTATOR_FLAVOR from zenml.stack.authentication_mixin import AuthenticationConfigMixin +from zenml.utils import deprecation_utils from zenml.utils.secret_utils import SecretField if TYPE_CHECKING: @@ -43,19 +44,23 @@ class ArgillaAnnotatorSettings(BaseSettings): Attributes: instance_url: URL of the Argilla instance. api_key: The api_key for Argilla - workspace: The workspace to use for the annotation interface. port: The port to use for the annotation interface. - extra_headers: Extra headers to include in the request. + headers: Extra headers to include in the request. httpx_extra_kwargs: Extra kwargs to pass to the client.
""" instance_url: str = DEFAULT_LOCAL_INSTANCE_URL api_key: Optional[str] = SecretField(default=None) - workspace: Optional[str] = "admin" - port: Optional[int] - extra_headers: Optional[str] = None + port: Optional[int] = DEFAULT_LOCAL_ARGILLA_PORT + headers: Optional[str] = None httpx_extra_kwargs: Optional[str] = None + extra_headers: Optional[str] = None + + _deprecation_validator = deprecation_utils.deprecate_pydantic_attributes( + ("extra_headers", "headers"), + ) + @field_validator("instance_url") @classmethod def ensure_instance_url_ends_without_slash(cls, instance_url: str) -> str: diff --git a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py index 342092416c7..c49bf5bed54 100644 --- a/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py +++ b/src/zenml/integrations/aws/orchestrators/sagemaker_orchestrator.py @@ -463,7 +463,13 @@ def prepare_or_run_pipeline( ) pipeline.create( - role_arn=self.config.execution_role, tags=settings.pipeline_tags + role_arn=self.config.execution_role, + tags=[ + {"Key": key, "Value": value} + for key, value in settings.pipeline_tags.items() + ] + if settings.pipeline_tags + else None, ) execution = pipeline.start() logger.warning( diff --git a/src/zenml/models/v2/core/step_run.py b/src/zenml/models/v2/core/step_run.py index 8284a19bcc9..588fcba50f9 100644 --- a/src/zenml/models/v2/core/step_run.py +++ b/src/zenml/models/v2/core/step_run.py @@ -156,6 +156,14 @@ class StepRunResponseBody(WorkspaceScopedResponseBody): """Response body for step runs.""" status: ExecutionStatus = Field(title="The status of the step.") + start_time: Optional[datetime] = Field( + title="The start time of the step run.", + default=None, + ) + end_time: Optional[datetime] = Field( + title="The end time of the step run.", + default=None, + ) inputs: Dict[str, "ArtifactVersionResponse"] = Field( title="The input artifact versions of the step run.", default={}, @@ -201,16 +209,6 @@ class StepRunResponseMetadata(WorkspaceScopedResponseMetadata): max_length=TEXT_FIELD_MAX_LENGTH, ) - # Timestamps - start_time: Optional[datetime] = Field( - title="The start time of the step run.", - default=None, - ) - end_time: Optional[datetime] = Field( - title="The end time of the step run.", - default=None, - ) - # References logs: Optional["LogsResponse"] = Field( title="Logs associated with this step run.", @@ -409,7 +407,7 @@ def start_time(self) -> Optional[datetime]: Returns: the value of the property. """ - return self.get_metadata().start_time + return self.get_body().start_time @property def end_time(self) -> Optional[datetime]: @@ -418,7 +416,7 @@ def end_time(self) -> Optional[datetime]: Returns: the value of the property. 
""" - return self.get_metadata().end_time + return self.get_body().end_time @property def logs(self) -> Optional["LogsResponse"]: diff --git a/src/zenml/zen_server/deploy/helm/Chart.yaml b/src/zenml/zen_server/deploy/helm/Chart.yaml index 43d77cdd201..ddc034a1607 100644 --- a/src/zenml/zen_server/deploy/helm/Chart.yaml +++ b/src/zenml/zen_server/deploy/helm/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: zenml -version: "0.67.0" +version: "0.68.0" description: Open source MLOps framework for portable production ready ML pipelines keywords: - mlops diff --git a/src/zenml/zen_server/deploy/helm/README.md b/src/zenml/zen_server/deploy/helm/README.md index c122990462d..258aeee97f8 100644 --- a/src/zenml/zen_server/deploy/helm/README.md +++ b/src/zenml/zen_server/deploy/helm/README.md @@ -20,8 +20,8 @@ ZenML is an open-source MLOps framework designed to help you create robust, main To install the ZenML chart directly from Amazon ECR, use the following command: ```bash -# example command for version 0.67.0 -helm install my-zenml oci://public.ecr.aws/zenml/zenml --version 0.67.0 +# example command for version 0.68.0 +helm install my-zenml oci://public.ecr.aws/zenml/zenml --version 0.68.0 ``` Note: Ensure you have OCI support enabled in your Helm client and that you are authenticated with Amazon ECR. diff --git a/src/zenml/zen_stores/migrations/versions/0.68.0_release.py b/src/zenml/zen_stores/migrations/versions/0.68.0_release.py new file mode 100644 index 00000000000..fa9c8267f68 --- /dev/null +++ b/src/zenml/zen_stores/migrations/versions/0.68.0_release.py @@ -0,0 +1,23 @@ +"""Release [0.68.0]. + +Revision ID: 0.68.0 +Revises: 1d8f30c54477 +Create Date: 2024-10-24 13:14:56.912159 + +""" + +# revision identifiers, used by Alembic. +revision = "0.68.0" +down_revision = "1d8f30c54477" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + """Upgrade database schema and/or data, creating a new revision.""" + pass + + +def downgrade() -> None: + """Downgrade database schema and/or data back to the previous revision.""" + pass diff --git a/src/zenml/zen_stores/schemas/step_run_schemas.py b/src/zenml/zen_stores/schemas/step_run_schemas.py index 07812b26ec4..c5478d3cb95 100644 --- a/src/zenml/zen_stores/schemas/step_run_schemas.py +++ b/src/zenml/zen_stores/schemas/step_run_schemas.py @@ -262,6 +262,8 @@ def to_model( body = StepRunResponseBody( user=self.user.to_model() if self.user else None, status=ExecutionStatus(self.status), + start_time=self.start_time, + end_time=self.end_time, inputs=input_artifacts, outputs=output_artifacts, created=self.created, @@ -278,8 +280,6 @@ def to_model( code_hash=self.code_hash, docstring=self.docstring, source_code=self.source_code, - start_time=self.start_time, - end_time=self.end_time, logs=self.logs.to_model() if self.logs else None, deployment_id=self.deployment_id, pipeline_run_id=self.pipeline_run_id,