diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 9d6b2d4005..cfc901a9e4 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -51,7 +51,7 @@ jobs: echo ${{ env.COMMIT_SHA }} > ./commit_sha echo ${{ env.PR_NUMBER }} > ./pr_number - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: doc-build-artifact path: optimum-habana/habana-doc-build/ diff --git a/.github/workflows/fast_tests.yml b/.github/workflows/fast_tests.yml index cdd7d1dbf5..5a1e982926 100644 --- a/.github/workflows/fast_tests.yml +++ b/.github/workflows/fast_tests.yml @@ -21,7 +21,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -36,7 +36,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/fast_tests.sh diffusers: name: Run tests for optimum.habana.diffusers @@ -46,7 +46,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -61,5 +61,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/fast_tests_diffusers.sh diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml index d0fcb85051..e7fb736923 100644 --- a/.github/workflows/slow_tests.yml +++ b/.github/workflows/slow_tests.yml @@ -19,7 +19,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -31,7 +31,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/example_diff_tests.sh stable-diffusion: name: Test Stable Diffusion @@ -45,7 +45,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -57,7 +57,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_diffusers.sh deepspeed: name: Test DeepSpeed models @@ -72,7 +72,7 @@ jobs: uses: 
actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -84,7 +84,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_deepspeed.sh multi-card: name: Test multi-card models @@ -99,7 +99,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -111,7 +111,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_8x.sh single-card: name: Test single-card models @@ -127,7 +127,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -139,7 +139,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_1x.sh albert-xxl-single-card: name: Test single-card ALBERT XXL @@ -158,7 +158,7 @@ jobs: - name: Pull image if: github.event.schedule == '0 21 * * 6' run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run test if: github.event.schedule == '0 21 * * 6' run: | @@ -171,7 +171,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/albert_xxl_1x.sh - name: Warning if: github.event.schedule != '0 21 * * 6' @@ -192,7 +192,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -204,7 +204,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} trl: name: Test TRL integration @@ -223,7 +223,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull 
vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -235,7 +235,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_trl.sh sentence-transformers: name: Test Sentence Transformers integration @@ -263,7 +263,7 @@ jobs: path: sentence-transformers - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -275,5 +275,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash optimum-habana/tests/ci/sentence_transformers.sh diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml index 88a37aa1b2..c5b7dbbb2c 100644 --- a/.github/workflows/slow_tests_gaudi2.yml +++ b/.github/workflows/slow_tests_gaudi2.yml @@ -17,7 +17,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -30,7 +30,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/example_diff_tests.sh stable-diffusion: name: Test Stable Diffusion @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -59,8 +59,8 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ - /bin/bash tests/ci/slow_tests_diffusers.sh + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ + /bin/bash tests/ci/slow_tests_diffusers.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} deepspeed: name: Test DeepSpeed models if: ${{ !cancelled() && (success() || failure()) }} @@ -72,7 +72,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -88,7 +88,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash 
tests/ci/slow_tests_deepspeed.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} fsdp: name: Test FSDP models @@ -101,7 +101,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -117,7 +117,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ make slow_tests_fsdp TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} multi-card: name: Test multi-card models @@ -130,7 +130,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -146,7 +146,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_8x.sh ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} single-card: name: Test single-card models @@ -160,7 +160,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -177,7 +177,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_1x.sh text-generation: name: Test text-generation example @@ -192,7 +192,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -208,7 +208,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} trl: name: Test TRL integration @@ -221,7 +221,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -237,7 +237,7 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash tests/ci/slow_tests_trl.sh sentence-transformers: name: Test 
Sentence Transformers integration @@ -258,7 +258,7 @@ jobs: path: sentence-transformers - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + docker pull vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - name: Run tests run: | docker run \ @@ -274,5 +274,5 @@ jobs: --cap-add=sys_nice \ --net=host \ --ipc=host \ - vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest \ + vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest \ /bin/bash optimum-habana/tests/ci/sentence_transformers.sh diff --git a/Makefile b/Makefile index 854197d214..8065ba4b69 100644 --- a/Makefile +++ b/Makefile @@ -93,12 +93,12 @@ slow_tests_8x: test_installs # Run DeepSpeed non-regression tests slow_tests_deepspeed: test_installs - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 python -m pytest tests/test_examples.py -v -s -k "deepspeed" slow_tests_diffusers: test_installs python -m pip install -r examples/stable-diffusion/requirements.txt - python -m pytest tests/test_diffusers.py -v -s -k "test_textual_inversion" + python -m pytest tests/test_diffusers.py -v -s -k "textual_inversion" python -m pip install peft==0.7.0 python -m pytest tests/test_diffusers.py -v -s -k "test_train_text_to_image_" python -m pytest tests/test_diffusers.py -v -s -k "test_train_controlnet" @@ -107,8 +107,9 @@ slow_tests_diffusers: test_installs # Run text-generation non-regression tests slow_tests_text_generation_example: test_installs + python -m pip install -r examples/text-generation/requirements_awq.txt BUILD_CUDA_EXT=0 python -m pip install -vvv --no-build-isolation git+https://github.com/HabanaAI/AutoGPTQ.git - python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN) # Run image-to-text non-regression tests @@ -120,6 +121,11 @@ slow_tests_openclip_vqa_example: test_installs python -m pip install -r examples/visual-question-answering/openclip_requirements.txt python -m pytest tests/test_openclip_vqa.py +# Run video comprehension tests +slow_tests_video_llava_example: test_installs + python -m pip install -r examples/video-comprehension/requirements.txt + python -m pytest tests/test_video_llava.py + slow_tests_fsdp: test_installs python -m pytest tests/test_fsdp_examples.py -v -s --token $(TOKEN) diff --git a/README.md b/README.md index e44ca5430c..d59d9cc932 100644 --- a/README.md +++ b/README.md @@ -25,24 +25,30 @@ limitations under the License. # Optimum for IntelĀ® GaudiĀ® Accelerators -Optimum for Intel Gaudi - a.k.a. `optimum-habana` - is the interface between the Transformers and Diffusers libraries and [Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html). -It provides a set of tools enabling easy model loading, training and inference on single- and multi-HPU settings for different downstream tasks. -The list of officially validated models and tasks is available [here](https://github.com/huggingface/optimum-habana#validated-models). Users can try other of the thousands of Hugging Face models on Intel Gaudi accelerators and tasks with only few changes. 
+Optimum for Intel Gaudi - a.k.a. `optimum-habana` - is the interface between the Transformers and Diffusers libraries and +[Intel Gaudi AI Accelerators (HPU)](https://docs.habana.ai/en/latest/index.html). It provides a set of tools enabling easy +model loading, training and inference on single- and multi-HPU settings for different downstream tasks. The list of officially +validated models and tasks is available [here](https://github.com/huggingface/optimum-habana#validated-models). Users can +try other of the thousands of Hugging Face models on Intel Gaudi accelerators and tasks with only few changes. ## What are Intel Gaudi AI Accelerators (HPUs)? HPUs offer fast model training and inference as well as a great price-performance ratio. -Check out [this blog post about BLOOM inference](https://huggingface.co/blog/habana-gaudi-2-bloom) and [this post benchmarking Intel Gaudi 2 and NVIDIA A100 GPUs for BridgeTower training](https://huggingface.co/blog/bridgetower) for concrete examples. +Check out [this blog post about BLOOM inference](https://huggingface.co/blog/habana-gaudi-2-bloom) and +[this post benchmarking Intel Gaudi 2 and NVIDIA A100 GPUs for BridgeTower training](https://huggingface.co/blog/bridgetower) +for concrete examples. ## Gaudi Setup Please refer to the Intel Gaudi AI Accelerator official [installation guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). -> Tests should be run in a Docker container based on Intel Gaudi Docker images. -> -> The current version has been validated for SynapseAI 1.19. +> [!NOTE] +> Tests should be run in a Docker container based on Intel Gaudi's official images. Instructions to +> obtain the latest containers from the Intel Gaudi Vault are available +> [here](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html#use-intel-gaudi-containers). +> The current Optimum for Intel Gaudi has been validated with Intel Gaudi v1.20 stack. ## Install the library and get example scripts @@ -50,18 +56,18 @@ Please refer to the Intel Gaudi AI Accelerator official [installation guide](htt ### Option 1: Use the latest stable release To install the latest stable release of this package ->```bash ->pip install --upgrade-strategy eager optimum[habana] ->``` +```bash +pip install --upgrade-strategy eager optimum[habana] +``` The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is upgraded to the latest stable release. To use the example associated with the latest stable release, run: -> ``` -> git clone https://github.com/huggingface/optimum-habana -> cd optimum-habana && git checkout v1.15.0 -> ``` -> with `v1.15.0` the version number of this release. +```bash +git clone https://github.com/huggingface/optimum-habana +cd optimum-habana && git checkout v1.16.0 +``` +with `v1.16.0` being the latest Optimum for Intel Gaudi release version. ### Option 2: Use the latest main branch under development @@ -74,7 +80,8 @@ git clone https://github.com/huggingface/optimum-habana ### Option 3: Use the `transformers_future` branch to have the latest changes from Transformers -The `transformers_future` branch is regularly updated with the latest changes from the main branches of Optimum Habana and Transformers. This enables you to try out new Transformers features that have not been merged into the main branch yet. +The `transformers_future` branch is regularly updated with the latest changes from the main branches of Optimum for Intel Gaudi +and Transformers. 
This enables you to try out new Transformers features that have not been merged into the main branch yet. > [!WARNING] > The `transformers_future` branch may have some regressions or bugs and may be less stable than the main branch. @@ -84,34 +91,40 @@ pip install git+https://github.com/huggingface/optimum-habana.git@transformers_f git clone -b transformers_future https://github.com/huggingface/optimum-habana ``` -## Install dependencies +## Install Dependencies To use DeepSpeed on HPUs, you also need to run the following command: ->```bash ->pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 ->``` +```bash +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 +``` To install the requirements for every example: ->```bash ->cd ->pip install -r requirements.txt ->``` - +```bash +cd +pip install -r requirements.txt +``` ## How to use it? -### Quick Start - -Optimum for Intel Gaudi was designed with one goal in mind: **to make training and inference straightforward for Transformers and Diffusers users, while fully leveraging the power of Intel Gaudi AI Accelerators**. +Optimum for Intel Gaudi was designed with one goal in mind: **to make training and inference straightforward for Transformers +and Diffusers users, while fully leveraging the power of Intel Gaudi AI Accelerators**. -#### Transformers Interface +### Transformers Interface There are two main classes one needs to know: -- [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer): the trainer class that takes care of compiling and distributing the model to run on HPUs, and performing training and evaluation. -- [GaudiConfig](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config): the class that enables to configure Habana Mixed Precision and to decide whether optimized operators and optimizers should be used or not. -The [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer) is very similar to the [Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer), and adapting a script using the Trainer to make it work with Intel Gaudi accelerators will mostly consist in simply swapping the `Trainer` class for the `GaudiTrainer` one. -That's how most of the [example scripts](https://github.com/huggingface/optimum-habana/tree/main/examples) were adapted from their [original counterparts](https://github.com/huggingface/transformers/tree/main/examples/pytorch). +- [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer): the trainer class that takes care of + compiling and distributing the model to run on HPUs, and performing training and evaluation. + +- [GaudiConfig](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config): the class that enables to configure + Gaudi Mixed Precision and to decide whether optimized operators and optimizers should be used or not. + +The [GaudiTrainer](https://huggingface.co/docs/optimum/habana/package_reference/trainer) is very similar to the +[Transformers Trainer](https://huggingface.co/docs/transformers/main_classes/trainer), and adapting a script using the Trainer to +make it work with Intel Gaudi accelerators will mostly consist in simply swapping the `Trainer` class for the `GaudiTrainer` one. 
+ +That's how most of the [example scripts](https://github.com/huggingface/optimum-habana/tree/main/examples) were adapted from their +[original counterparts](https://github.com/huggingface/transformers/tree/main/examples/pytorch). Here is an example: ```diff @@ -141,12 +154,17 @@ Here is an example: ) ``` -where `gaudi_config_name` is the name of a model from the [Hub](https://huggingface.co/Habana) (Intel Gaudi configurations are stored in model repositories) or a path to a local Intel Gaudi configuration file (you can see [here](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) how to write your own). +where `gaudi_config_name` is the name of a model from the [Hub](https://huggingface.co/Habana) (Intel Gaudi configurations +are stored in model repositories) or a path to a local Intel Gaudi configuration file (you can see +[here](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) how to write your own). -#### Diffusers Interface +### Diffusers Interface -You can generate images from prompts using Stable Diffusion on Intel Gaudi using the [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline) class and the [`GaudiDDIMScheduler`] which have been both optimized for HPUs. Here is how to use them and the differences with the Diffusers library: +You can generate images from prompts using Stable Diffusion on Intel Gaudi using the +[`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline) class and the +[`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler) +class which have been both optimized for HPUs. Here is how to use them and the differences with the Diffusers library: ```diff - from diffusers import DDIMScheduler, StableDiffusionPipeline @@ -167,7 +185,7 @@ model_name = "CompVis/stable-diffusion-v1-4" + gaudi_config="Habana/stable-diffusion", ) -outputs = generator( +outputs = pipeline( ["An image of a squirrel in Picasso style"], num_images_per_prompt=16, + batch_size=4, @@ -177,9 +195,14 @@ outputs = generator( ## Important Note on Pytorch 2.5 Performance Degradation -With the upgrade to PyTorch 2.5, users may experience some performance degradation due to changes in the handling of FP16/BF16 inputs. The note from PyTorch 2.5 states: +With the upgrade to PyTorch 2.5, users may experience some performance degradation due to changes in the handling of FP16/BF16 inputs. +The note from PyTorch 2.5 states: -"A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the usage of low-precision intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. Computations are performed in FP32/TF32, and the final FP32 results are then downcasted back to FP16/BF16. This will improve numerical accuracy of the final output for the math backend with FP16/BF16 inputs, but increases memory usages and may cause the performance regressions in the math backend as computations shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul." +"A naive SDPA math backend, when using FP16/BF16 inputs, can accumulate significant numerical errors due to the usage of low-precision +intermediate buffers. To mitigate this issue, the default behavior now involves upcasting FP16/BF16 inputs to FP32. 
Computations are performed +in FP32/TF32, and the final FP32 results are then downcasted back to FP16/BF16. This will improve numerical accuracy of the final output for +the math backend with FP16/BF16 inputs, but increases memory usages and may cause the performance regressions in the math backend as computations +shift from FP16/BF16 BMM to FP32/TF32 BMM/Matmul." For scenarios where reduced-precision reductions are preferred for speed, they can be enabled with the following setting: ```python @@ -200,108 +223,110 @@ Check out [the documentation of Optimum for Intel Gaudi](https://huggingface.co/ The following model architectures, tasks and device distributions have been validated for Optimum for Intel Gaudi: +> [!NOTE] > In the tables below, :heavy_check_mark: means single-card, multi-card and DeepSpeed have all been validated. -- Transformers: -
+### Transformers:
-| Architecture | Training | Inference | Tasks |
-|--------------|:--------:|:---------:|:-----------------------|
+| Architecture | Training | Inference | Tasks |
+|:-------------|:--------:|:---------:|:------|
| BERT | :heavy_check_mark: | :heavy_check_mark: |
  • [text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text feature extraction](https://github.com/huggingface/optimum-habana/tree/main/examples/text-feature-extraction)
  • | | RoBERTa | :heavy_check_mark: | :heavy_check_mark: |
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • | | ALBERT | :heavy_check_mark: | :heavy_check_mark: |
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • | | DistilBERT |:heavy_check_mark: | :heavy_check_mark: |
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • | | GPT2 | :heavy_check_mark: | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| BLOOM(Z) | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| StarCoder / StarCoder2 | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| GPT-J |
  • DeepSpeed
  • |
  • Single card
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| GPT-Neo | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| GPT-NeoX |
  • DeepSpeed
  • |
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| OPT | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| BLOOM(Z) | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| StarCoder / StarCoder2 | :heavy_check_mark: |
  • Single-card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| GPT-J |
  • DeepSpeed
  • |
  • Single card
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| GPT-Neo | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| GPT-NeoX |
  • DeepSpeed
  • |
  • DeepSpeed
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| OPT | |
  • DeepSpeed
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | | Llama 2 / CodeLlama / Llama 3 / Llama Guard / Granite | :heavy_check_mark: | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering)
  • [text classification](https://github.com/huggingface/optimum-habana/tree/main/examples/text-classification) (Llama Guard)
  • | -| StableLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Falcon |
  • LoRA
  • | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| CodeGen | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| MPT | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Mistral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Phi | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Mixtral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Persimmon | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Qwen2 |
  • Single card
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Qwen2-MoE | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Gemma | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| StableLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Falcon |
  • LoRA
  • | :heavy_check_mark: |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| CodeGen | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| MPT | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Mistral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Phi | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Mixtral | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Persimmon | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Qwen2 |
  • Single card
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Qwen2-MoE | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Gemma | :heavy_check_mark: |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | | Gemma2 | | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| XGLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Cohere | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| XGLM | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Cohere | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | | T5 / Flan T5 | :heavy_check_mark: | :heavy_check_mark: |
  • [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)
  • [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)
  • | -| BART | |
  • Single card
  • |
  • [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)
  • [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)
  • | +| BART | |
  • Single card
  • |
  • [summarization](https://github.com/huggingface/optimum-habana/tree/main/examples/summarization)
  • [translation](https://github.com/huggingface/optimum-habana/tree/main/examples/translation)
  • [question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/question-answering#fine-tuning-t5-on-squad20)
  • | | ViT | :heavy_check_mark: | :heavy_check_mark: |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | | Swin | :heavy_check_mark: | :heavy_check_mark: |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | | Wav2Vec2 | :heavy_check_mark: | :heavy_check_mark: |
  • [audio classification](https://github.com/huggingface/optimum-habana/tree/main/examples/audio-classification)
  • [speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition)
  • | | Whisper | :heavy_check_mark: | :heavy_check_mark: |
  • [speech recognition](https://github.com/huggingface/optimum-habana/tree/main/examples/speech-recognition)
  • | -| SpeechT5 | |
  • Single card
  • |
  • [text to speech](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-speech)
  • | +| SpeechT5 | |
  • Single card
  • |
  • [text to speech](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-speech)
  • | | CLIP | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | | BridgeTower | :heavy_check_mark: | :heavy_check_mark: |
  • [contrastive image-text training](https://github.com/huggingface/optimum-habana/tree/main/examples/contrastive-image-text)
  • | -| ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | -| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| OWLViT | |
  • Single card
  • |
  • [zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)
  • | -| ClipSeg | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | -| Llava / Llava-next | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| idefics2 |
  • LoRA
  • |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| Paligemma | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| Segment Anything Model | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | -| VideoMAE | |
  • Single card
  • |
  • [Video classification](https://github.com/huggingface/optimum-habana/tree/main/examples/video-classification)
  • | -| TableTransformer | |
  • Single card
  • |
  • [table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)
  • | -| DETR | |
  • Single card
  • |
  • [object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)
  • | -| Mllama |
  • LoRA
  • | :heavy_check_mark: |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | -| MiniCPM3 | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| Baichuan2 |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| DeepSeek-V2 | | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -| ChatGLM |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | -
    - -- Diffusers: - -
    - -| Architecture | Training | Inference | Tasks | -|------------------|:--------:|:--------------------:|:------| -| Stable Diffusion |
  • [textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)
  • [ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| Stable Diffusion XL |
  • [fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| Stable Diffusion Depth2img | |
  • Single card
  • |
  • [depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| LDM3D | |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| FLUX.1 |
  • [fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev)
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)
  • | -| Text to Video | |
  • Single card
  • |
  • [text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)
  • | +| ESMFold | |
  • Single card
  • |
  • [protein folding](https://github.com/huggingface/optimum-habana/tree/main/examples/protein-folding)
  • | +| Blip | |
  • Single card
  • |
  • [visual question answering](https://github.com/huggingface/optimum-habana/tree/main/examples/visual-question-answering)
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| OWLViT | |
  • Single card
  • |
  • [zero shot object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/zero-shot-object-detection)
  • | +| ClipSeg | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | +| Llava / Llava-next | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| idefics2 |
  • LoRA
  • |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| Paligemma | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| Segment Anything Model | |
  • Single card
  • |
  • [object segmentation](https://github.com/huggingface/optimum-habana/tree/main/examples/object-segementation)
  • | +| VideoMAE | |
  • Single card
  • |
  • [video classification](https://github.com/huggingface/optimum-habana/tree/main/examples/video-classification)
  • | +| TableTransformer | |
  • Single card
  • |
  • [table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection)
  • | +| DETR | |
  • Single card
  • |
  • [object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection)
  • | +| Mllama |
  • LoRA
  • | :heavy_check_mark: |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| MiniCPM3 | |
  • Single card
  • |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Baichuan2 |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| DeepSeek-V2 | :heavy_check_mark: | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| DeepSeek-V3 | | :heavy_check_mark: |
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| ChatGLM |
  • DeepSpeed
  • |
  • Single card
  • |
  • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)
  • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
  • | +| Qwen2-VL | |
  • Single card
  • |
  • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)
  • | +| VideoLLaVA | |
  • Single card
  • |
  • [video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension)
  • |
    -- PyTorch Image Models/TIMM: -
    +### Diffusers: | Architecture | Training | Inference | Tasks | -|---------------------|:--------:|:---------:|:------| -| FastViT | |
  • Single card
  • |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | +|:--------------------|:--------:|:---------:|:------| +| Stable Diffusion | :heavy_check_mark: | :heavy_check_mark: |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion)
  • [image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-based-image-to-image)
  • | +| Stable Diffusion XL | :heavy_check_mark: | :heavy_check_mark: |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-xl-sdxl)
  • [image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-xl-refiner)
  • | +| Stable Diffusion Depth2img | |
  • Single card
  • |
  • [depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#depth-to-image-generation)
  • | +| Stable Diffusion 3 | |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#stable-diffusion-3-sd3)
  • | +| LDM3D | |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#latent-diffusion-model-for-3d-ldm3d)
  • | +| FLUX.1 |
  • LoRA
  • |
  • Single card
  • |
  • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#flux1)
  • [image-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#flux1-image-to-image)
  • | +| Text to Video | |
  • Single card
  • |
  • [text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation)
  • | +| Image to Video | |
  • Single card
  • |
  • [image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#image-to-video-generation)
  • | +| i2vgen-xl | |
  • Single card
  • |
  • [image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#I2vgen-xl)
  • | + +### PyTorch Image Models/TIMM: -
    - -- TRL: +| Architecture | Training | Inference | Tasks | +|:--------------------|:--------:|:---------:|:------| +| FastViT | |
  • Single card
  • |
  • [image classification](https://github.com/huggingface/optimum-habana/tree/main/examples/image-classification)
  • | -
    +### TRL: | Architecture | Training | Inference | Tasks | -|------------------|:--------:|:--------------------:|:-----------------------------------------------------------------------------------------------| -| Llama 2 | :heavy_check_mark: | |
  • [DPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)
  • | -| Llama 2 | :heavy_check_mark: | |
  • [PPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)
  • | -| Stable Diffusion | :heavy_check_mark: | |
  • [DDPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl)
  • | +|:-----------------|:--------:|:--------------------:|:-----------------------------------------------------------------------------------------------| +| Llama 2 | :heavy_check_mark: | |
  • [DPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl#dpo-pipeline)
  • | +| Llama 2 | :heavy_check_mark: | |
  • [PPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl#ppo-pipeline)
  • | +| Stable Diffusion | :heavy_check_mark: | |
  • [DDPO Pipeline](https://github.com/huggingface/optimum-habana/tree/main/examples/trl#ddpo-pipeline)
  • | -
    - -Other models and tasks supported by the Transformers and Diffusers libraries may also work. You can refer to this [section](https://github.com/huggingface/optimum-habana#how-to-use-it) for using them with Optimum for Intel Gaudi. In addition, [this page](https://github.com/huggingface/optimum-habana/tree/main/examples) explains how to modify any [example](https://github.com/huggingface/transformers/tree/main/examples/pytorch) from the Transformers library to make it work with Optimum for Intel Gaudi. +Other models and tasks supported by the Transformers and Diffusers libraries may also work. You can refer to this [section](https://github.com/huggingface/optimum-habana#how-to-use-it) +for using them with Optimum for Intel Gaudi. In addition, [this page](https://github.com/huggingface/optimum-habana/tree/main/examples) explains how to modify any +[example](https://github.com/huggingface/transformers/tree/main/examples/pytorch) from the Transformers library to make it work with Optimum for Intel Gaudi. If you find any issues while using those, please open an issue or a pull request. -After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness. +After training your model, feel free to submit it to the Intel [leaderboard](https://huggingface.co/spaces/Intel/powered_by_intel_llm_leaderboard) which is designed +to evaluate, score, and rank open-source LLMs that have been pre-trained or fine-tuned on Intel Hardwares. Models submitted to the leaderboard will be evaluated on +the Intel Developer Cloud. The evaluation platform consists of Gaudi Accelerators and Xeon CPUs running benchmarks from the Eleuther AI Language Model Evaluation Harness. 
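As a concrete illustration of the `Trainer` to `GaudiTrainer` swap described in the README text above, a minimal end-to-end sketch follows. The model, dataset, Gaudi configuration name and hyperparameters are illustrative assumptions and are not taken from this PR; the HPU-specific arguments (`use_habana`, `use_lazy_mode`, `gaudi_config_name`) reflect the `GaudiTrainingArguments` API as currently understood.

```python
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.habana import GaudiTrainer, GaudiTrainingArguments

# Illustrative choices only -- any validated model/task from the tables above could be used instead.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Small slice of SST-2 just to keep the sketch runnable end to end.
dataset = load_dataset("glue", "sst2", split="train[:1%]")
dataset = dataset.map(
    lambda batch: tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=128),
    batched=True,
)

# GaudiTrainingArguments mirrors transformers.TrainingArguments plus HPU-specific flags.
training_args = GaudiTrainingArguments(
    output_dir="./output",
    use_habana=True,                                # run on HPU
    use_lazy_mode=True,                             # HPU lazy execution mode
    gaudi_config_name="Habana/bert-base-uncased",   # Gaudi configuration hosted on the Hub
    per_device_train_batch_size=8,
    num_train_epochs=1,
)

# Only the trainer class changes relative to a stock Transformers script.
trainer = GaudiTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```

Run inside the Gaudi Docker image referenced by the CI workflows above, this follows the same `GaudiTrainer` path the adapted example scripts use.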
+ +The list of validated models through continuous integration tests is posted [here](https://github.com/huggingface/optimum-habana/tree/main/tests/Habana_Validated_Models.md) ## Development diff --git a/conftest.py b/conftest.py index 71cb6bb7ca..1ce796e44f 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,78 @@ +import json +import logging +import os +import sys +from pathlib import Path + +import pytest + + +BASELINE_DIRECTORY = Path(__file__).parent.resolve() / Path("tests") / Path("baselines") / Path("fixture") + + +def walk_path(path: Path): + """ + Taken from https://stackoverflow.com/a/76236680 + + Path.walk() is not available until python 3.12 + """ + subdirs = [d for d in path.iterdir() if d.is_dir()] + files = [f for f in path.iterdir() if f.is_file()] + yield path, subdirs, files + for s in subdirs: + yield from walk_path(s) + + +class Baseline: + def __init__(self, session): + self.rebase = session.config.option.rebase + self.references = {} + + if BASELINE_DIRECTORY.exists(): + for root, dirs, files in walk_path(BASELINE_DIRECTORY): + for name in files: + with (root / name).open() as f: + self.references.update(json.load(f)) + + def get_reference(self, addr, context=[]): + reference = self.references.setdefault(addr, {}) + for c in context: + reference = reference.setdefault(c, {}) + return reference + + def finalize(self): + if self.rebase: + # aggregate refs by test file + refsbyfile = {} + for case, ref in self.references.items(): + key = case.split("::")[0] + reffile = BASELINE_DIRECTORY / Path(key).with_suffix(".json") + refsbyfile.setdefault(reffile, {})[case] = ref + + # dump aggregated refs into their own files + for reffile, refs in refsbyfile.items(): + reffile.parent.mkdir(parents=True, exist_ok=True) + with reffile.open("w+") as f: + json.dump(refs, f, indent=2, sort_keys=True) + + +class BaselineRequest: + def __init__(self, request): + self.baseline = request.session.stash["baseline"] + self.addr = request.node.nodeid + + def assertRef(self, compare, context=[], **kwargs): + reference = self.baseline.get_reference(self.addr, context) + if self.baseline.rebase: + reference.update(**kwargs) + + for key, actual in kwargs.items(): + ref = reference.get(key, None) + logging.getLogger().info(f"{'.'.join(context + [key])}:actual = {actual}") + logging.getLogger().info(f"{'.'.join(context + [key])}:ref = {ref}") + assert compare(actual, ref) + + class Secret: """ Taken from: https://stackoverflow.com/a/67393351 @@ -15,11 +90,54 @@ def __str___(self): def pytest_addoption(parser): parser.addoption("--token", action="store", default=None) + parser.addoption("--rebase", action="store_true", help="rebase baseline references from current run") + parser.addoption("--device", action="store", default=None) + + +@pytest.fixture +def token(request): + return Secret(request.config.option.token) + + +def pytest_sessionstart(session): + session.stash["baseline"] = Baseline(session) + + # User command-line option takes highest priority + if session.config.option.device is not None: + device = str(session.config.option.device).lower() + # User GAUDI2_CI environment variable takes second priority for backwards compatibility + elif "GAUDI2_CI" in os.environ: + device = "gaudi2" if os.environ["GAUDI2_CI"] == "1" else "gaudi1" + # Try to automatically detect it + else: + from optimum.habana.utils import get_device_name + + device = get_device_name() + + # optimum.habana.utils.get_device_name() returns `gaudi` for G1 + if "gaudi" == device: + # use "gaudi1" since this is used in 
tests, baselines, etc. + device = "gaudi1" + + from tests import utils + + utils.OH_DEVICE_CONTEXT = device + session.config.stash["device-context"] = device + + # WA: delete the imported top-level tests module so we don't overshadow + # tests/transformers/tests module. + # This fixes python -m pytest tests/transformers/tests/models/ -s -v + del sys.modules["tests"] + + +def pytest_report_header(config): + return [f"device context: {config.stash['device-context']}"] + + +def pytest_sessionfinish(session): + session.stash["baseline"].finalize() -def pytest_generate_tests(metafunc): - # This is called for every test. Only get/set command line arguments - # if the argument is specified in the list of test "fixturenames". - option_value = Secret(metafunc.config.option.token) - if "token" in metafunc.fixturenames: - metafunc.parametrize("token", [option_value]) +@pytest.fixture +def baseline(request): + return BaselineRequest(request) diff --git a/docs/Dockerfile b/docs/Dockerfile index 060b7413dc..ead30b7412 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest ARG commit_sha ARG clone_url diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 51d6dadf0f..a7cb1f1e92 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -105,10 +105,13 @@ In the tables below, āœ… means single-card, multi-card and DeepSpeed have all be | TableTransformer | |
• Single card | • [table object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/table-detection) |
| DETR | | • Single card | • [object detection](https://github.com/huggingface/optimum-habana/tree/main/examples/object-detection) |
| Mllama | • LoRA | ✅ | • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text) |
+| Video-LLaVA | | • Single card | • [video comprehension](https://github.com/huggingface/optimum-habana/tree/main/examples/video-comprehension) |
| MiniCPM3 | | • Single card | • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation) |
| Baichuan2 | • DeepSpeed | • Single card | • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling) • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation) |
-| DeepSeek-V2 | | ✅ | • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation) |
+| DeepSeek-V2 | ✅ | ✅ | • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation) |
+| DeepSeek-V3 | | ✅ | • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation) |
| ChatGLM | • DeepSpeed | • Single card | • [language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling) • [text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation) |
+| Qwen2-VL | | • Single card | • [image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text) |

- Diffusers

@@ -119,7 +122,8 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be

| Stable Diffusion Depth2img | | • Single card | • [depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) |
| LDM3D | | • Single card | • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) |
| FLUX.1 | • [fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#dreambooth-lora-fine-tuning-with-flux1-dev) | • Single card | • [text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) |
-| Text to Video | | • Single card | • [text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video) |
+| Text to Video | | • Single card | • [text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#text-to-video-generation) |
+| i2vgen-xl | | • Single card | • [image-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion#I2vgen-xl)
  • | - PyTorch Image Models/TIMM: diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index fa54c4446e..6b39fa1084 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -24,7 +24,7 @@ python -m pip install --upgrade-strategy eager optimum[habana] To use MicrosoftĀ® DeepSpeed with Intel Gaudi devices, you also need to run the following command: ```bash -python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 ``` To ensure that you are installing the correct Intel Gaudi Software, please run the `hl-smi` command to confirm the software version diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index c882de2629..57d0bf90cb 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -32,12 +32,12 @@ platform for deep learning and follow the steps to start and connect to the node ## Docker Setup Now that you have access to the node, you will use the latest Intel Gaudi AI Accelerator docker image by executing the docker run command which will -automatically download and run the docker. At the time of writing this guide, latest Gaudi docker version was 1.19.0: +automatically download and run the docker. At the time of writing this guide, latest Gaudi docker version was 1.20.0: ```bash -release=1.19.0 +release=1.20.0 os=ubuntu22.04 -torch=2.5.1 +torch=2.6.0 docker_image=vault.habana.ai/gaudi-docker/$release/$os/habanalabs/pytorch-installer-$torch:latest ``` @@ -65,11 +65,11 @@ docker run -itd \ ## Optimum for Intel Gaudi Setup Check latest release of Optimum for Intel Gaudi [here](https://github.com/huggingface/optimum-habana/releases). -At the time of writing this guide, latest Optimum for Intel Gaudi release version was v1.15.0, which is paired with Intel Gaudi Software release -version 1.19.0. Install Optimum for Intel Gaudi as follows: +At the time of writing this guide, latest Optimum for Intel Gaudi release version was v1.16.0, which is paired with Intel Gaudi Software release +version 1.20.0. Install Optimum for Intel Gaudi as follows: ```bash -git clone -b v1.15.0 https://github.com/huggingface/optimum-habana +git clone -b v1.16.0 https://github.com/huggingface/optimum-habana pip install ./optimum-habana ``` @@ -115,7 +115,7 @@ MicrosoftĀ® DeepSpeed. 
Gaudi-specific fork of the library is maintained by Intel To install the library compatible with the same Gaudi software release stack, use: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 ``` With DeepSpeed successfully installed we can now run a distributed GPT-2 inference on an 8 HPU system as follows: diff --git a/docs/source/usage_guides/deepspeed.mdx b/docs/source/usage_guides/deepspeed.mdx index f6617e92ce..6fc34f2261 100644 --- a/docs/source/usage_guides/deepspeed.mdx +++ b/docs/source/usage_guides/deepspeed.mdx @@ -32,7 +32,7 @@ You can find more information about DeepSpeed Gaudi integration [here](https://d To use DeepSpeed on Gaudi, you need to install Optimum for Intel Gaudi and [DeepSpeed fork for Intel Gaudi](https://github.com/HabanaAI/DeepSpeed) with: ```bash pip install optimum[habana] -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 ``` @@ -79,7 +79,7 @@ It is strongly advised to read [this section](https://huggingface.co/docs/transf -Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.19.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Intel. +Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.20.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Intel. The [Transformers documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed#configuration) explains how to write a configuration from scratch very well. A more complete description of all configuration possibilities is available [here](https://www.deepspeed.ai/docs/config-json/). diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md index 2fe6d5abd9..c8dd7b126c 100644 --- a/examples/audio-classification/README.md +++ b/examples/audio-classification/README.md @@ -27,6 +27,9 @@ First, you should install the requirements: pip install -r requirements.txt ``` +> [!NOTE] +> Please add the flags ENABLE_LB_BUNDLE_ALL_COMPUTE_MME=0 and ENABLE_EXPERIMENTAL_FLAGS=1 for facebook/wav2vec2-base stability issues on gaudi3. Please note this is a workaround for release 1.20 only. + ## Single-HPU The following command shows how to fine-tune [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on the šŸ—£ļø [Keyword Spotting subset](https://huggingface.co/datasets/superb#ks) of the SUPERB dataset on a single HPU. @@ -58,7 +61,8 @@ python run_audio_classification.py \ --throughput_warmup_steps 3 \ --sdp_on_bf16 \ --bf16 \ - --trust_remote_code True + --trust_remote_code True \ + --attn_implementation sdpa ``` On a single HPU, this script should run in ~13 minutes and yield an accuracy of **97.96%**. @@ -98,7 +102,8 @@ PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ --bf16 \ --trust_remote_code True \ --torch_compile \ - --torch_compile_backend hpu_backend + --torch_compile_backend hpu_backend \ + --attn_implementation sdpa ``` On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.49%**. @@ -107,52 +112,6 @@ On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.4 > If you get an error reporting unused parameters in the model, you can specify `--ddp_find_unused_parameters True`. Using this parameter might affect the training speed. 
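The flags in the Gaudi3 note above are ordinary environment variables, so they can simply be prepended to the launch command. Below is a rough single-card sketch; the dataset, output directory, and hyperparameters are illustrative values borrowed from other commands in this README rather than a prescription for Gaudi3:

```bash
# Gaudi3-only workaround for facebook/wav2vec2-base on Intel Gaudi software release 1.20
# (dataset and hyperparameters below are illustrative, not tuned)
ENABLE_LB_BUNDLE_ALL_COMPUTE_MME=0 ENABLE_EXPERIMENTAL_FLAGS=1 \
python run_audio_classification.py \
    --model_name_or_path facebook/wav2vec2-base \
    --dataset_name common_language \
    --audio_column_name audio \
    --label_column_name language \
    --output_dir /tmp/wav2vec2-base-lang-id \
    --overwrite_output_dir \
    --remove_unused_columns False \
    --do_train \
    --do_eval \
    --learning_rate 3e-4 \
    --max_length_seconds 8 \
    --warmup_ratio 0.1 \
    --num_train_epochs 10 \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 32 \
    --use_habana \
    --use_lazy_mode \
    --gaudi_config_name Habana/wav2vec2 \
    --throughput_warmup_steps 3 \
    --sdp_on_bf16 \
    --bf16 \
    --trust_remote_code True \
    --attn_implementation sdpa
```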
- -## DeepSpeed - -> You need to install DeepSpeed with: -> ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 -> ``` - -DeepSpeed can be used with almost the same command as for a multi-card run: -- `use_mpi` should be replaced by `use_deepspeed`, -- an additional `--deepspeed path_to_my_deepspeed config` argument should be provided, for instance `--deepspeed ../../tests/configs/deepspeed_zero_2.json`. - -For example: -```bash -PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_audio_classification.py \ - --model_name_or_path facebook/wav2vec2-base \ - --dataset_name common_language \ - --audio_column_name audio \ - --label_column_name language \ - --output_dir /tmp/wav2vec2-base-lang-id \ - --overwrite_output_dir \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --learning_rate 3e-4 \ - --max_length_seconds 8 \ - --attention_mask False \ - --warmup_ratio 0.1 \ - --num_train_epochs 10 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 32 \ - --seed 0 \ - --use_habana \ - --use_lazy_mode False\ - --gaudi_config_name Habana/wav2vec2 \ - --throughput_warmup_steps 3 \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json \ - --trust_remote_code True -``` - -[The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. - -> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it. - - ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... diff --git a/examples/audio-classification/requirements.txt b/examples/audio-classification/requirements.txt index 720a5a4abc..bae36f7451 100644 --- a/examples/audio-classification/requirements.txt +++ b/examples/audio-classification/requirements.txt @@ -1,3 +1,4 @@ datasets>=1.14.0 evaluate +numba==0.60.0 librosa diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index 6defd566d3..fbcd67ea73 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -47,7 +47,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") @@ -177,6 +177,33 @@ class ModelArguments: default=False, metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, ) + use_flash_attention: bool = field( + default=False, metadata={"help": "Whether to use Habana flash attention for fine-tuning"} + ) + flash_attention_recompute: bool = field( + default=False, + metadata={ + "help": "Whether to enable recompute in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." + }, + ) + flash_attention_fast_softmax: bool = field( + default=False, + metadata={ + "help": "Whether to use fast softmax for Habana flash attention." 
+ " It is applicable only when use_flash_attention is True." + }, + ) + + def __post_init__(self): + if self.use_flash_attention: + os.environ["USE_FLASH_ATTENTION"] = "1" + if self.flash_attention_recompute: + assert self.use_flash_attention, "flash_attention_recompute is set, but use_flash_attention is not" + os.environ["FLASH_ATTENTION_RECOMPUTE"] = "1" + if self.flash_attention_fast_softmax: + assert self.use_flash_attention, "flash_attention_fast_softmax is set, but use_flash_attention is not" + os.environ["FLASH_ATTENTION_FAST_SOFTMAX"] = "1" def main(): @@ -364,6 +391,7 @@ def compute_metrics(eval_pred): revision=model_args.model_revision, token=model_args.token, trust_remote_code=model_args.trust_remote_code, + attn_implementation=training_args.attn_implementation, ) model = AutoModelForAudioClassification.from_pretrained( model_args.model_name_or_path, diff --git a/examples/contrastive-image-text/README.md b/examples/contrastive-image-text/README.md index c0aa57ac41..def6d74ec0 100644 --- a/examples/contrastive-image-text/README.md +++ b/examples/contrastive-image-text/README.md @@ -163,61 +163,8 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_clip.py \ ### DeepSpeed -Run the following command for training with DeepSpeed: - -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 \ -python3 ../gaudi_spawn.py --world_size 8 --use_deepspeed run_clip.py \ - --output_dir=/tmp/clip_roberta \ - --model_name_or_path=./clip-roberta \ - --data_dir $PWD/data \ - --dataset_name ydshieh/coco_dataset_script \ - --dataset_config_name 2017 \ - --image_column image_path \ - --caption_column caption \ - --remove_unused_columns=False \ - --do_train --do_eval \ - --mediapipe_dataloader \ - --per_device_train_batch_size="64" \ - --per_device_eval_batch_size="64" \ - --learning_rate="5e-5" --warmup_steps="0" --weight_decay 0.1 \ - --overwrite_output_dir \ - --use_habana \ - --use_lazy_mode=False \ - --gaudi_config_name="Habana/clip" \ - --throughput_warmup_steps=30 \ - --save_strategy="no" \ - --dataloader_num_workers=2 \ - --use_hpu_graphs \ - --max_steps=100 \ - --torch_compile_backend=hpu_backend \ - --torch_compile \ - --logging_nan_inf_filter \ - --trust_remote_code \ - --deepspeed - -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` +You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed. +You can also look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. 
## BridgeTower @@ -244,7 +191,6 @@ python ../gaudi_spawn.py --use_mpi --world_size 8 run_bridgetower.py \ --logging_steps 10 \ --dataloader_num_workers 1 \ --mediapipe_dataloader \ - --distribution_strategy fast_ddp \ --trust_remote_code \ --sdp_on_bf16 ``` diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py index 5964b2cdcc..9d470bc350 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -59,7 +59,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index fc3bb4886e..d10eb84496 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py index f282809a31..6817ca0565 100644 --- a/examples/gaudi_spawn.py +++ b/examples/gaudi_spawn.py @@ -84,7 +84,7 @@ def main(): if not is_deepspeed_available(): raise ImportError( "--use_deepspeed requires deepspeed: `pip install" - " git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0`." + " git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0`." ) # Patch sys.argv diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index bc45087f9e..2d4a980f0b 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -64,7 +64,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index e4dbb05472..e41f1a6617 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -17,111 +17,12 @@ limitations under the License. # Image to Text Examples This directory contains a script that showcases how to perform image to text generation on IntelĀ® GaudiĀ® AI Accelerators. -## Single-HPU inference +Habana FusedSDPA is a fused and optimized implementation of torch.nn.functional.scaled_dot_product_attention() for Gaudi. For more details, refer to [Gaudi online documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa). 
We optimized many models with FusedSDPA implementation as in [optimum/habana/transformers/models](https://github.com/huggingface/optimum-habana/tree/main/optimum/habana/transformers/models). If a model is not optimized with FusedSDPA, it uses [SDPA implementation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html). -Models that have been validated: - - [nlpconnect/vit-gpt2-image-captioning](https://huggingface.co/nlpconnect/vit-gpt2-image-captioning) - - [Salesforce/blip-image-captioning-large](https://huggingface.co/Salesforce/blip-image-captioning-large) - - [Salesforce/blip-image-captioning-base](https://huggingface.co/Salesforce/blip-image-captioning-base) - - [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) - - [llava-hf/llava-1.5-13b-hf](https://huggingface.co/llava-hf/llava-1.5-13b-hf) - - [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) - - [llava-hf/llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) - - [llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) - - [llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) - - [llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf) - - [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) - - [meta-llama/Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) - - [meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) - - [tiiuae/falcon-11B-vlm](https://huggingface.co/tiiuae/falcon-11B-vlm) - - [google/paligemma-3b-mix-224](https://huggingface.co/google/paligemma-3b-mix-224) +## Inference with mixed-precision (BF16) -### Inference with BF16 - -To run Salesforce/blip-image-captioning-large inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path Salesforce/blip-image-captioning-large \ - --image_path "https://ankur3107.github.io/assets/images/image-captioning-example.png" \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run Llava-1.5-7b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run Llava-1.5-13b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-13b-hf \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run Llava-v1.6-mistral-7b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run Llava-v1.6-vicuna-13b inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-34b-hf \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run google/paligemma-3b-mix-224 inference, use the following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path google/paligemma-3b-mix-224 \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run Llava-hf/llama3-llava-next-8b-hf inference, use the 
following command: -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llama3-llava-next-8b-hf \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run idefics2 inference, use the following command: - -```bash -python3 run_pipeline.py \ - --model_name_or_path HuggingFaceM4/idefics2-8b \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -To run mllama inference using reduced precision in the SDPA, use the following command: +### Single card inference with BF16 +To run Llama inference with SDPA, use the following command: ```bash python3 run_pipeline.py \ @@ -130,55 +31,30 @@ python3 run_pipeline.py \ --bf16 \ --sdp_on_bf16 ``` +> SDPA may introduce [reduced precison](https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-reduction-for-fp16-and-bf16-in-scaled-dot-product-attention-sdpa) -### Inference with FP8 -Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. -More information on enabling FP8 in SynapseAI is available here: -https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html +### Multi-cards inference with BF16 -Here is an example to measure the tensor quantization statistics on Llava-1.5-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -Here is an example to quantize the model based on previous measurements for Llava-1.5-7b: +Use the following commands to run Llama-3.2-90B-Vision-Instruct BF16 inference with FusedSDPA on 8 HPUs: ```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ +PT_HPU_ENABLE_LAZY_COLLECTIVES=true python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ + --model_name_or_path meta-llama/Llama-3.2-90B-Vision-Instruct \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ --bf16 \ - --sdp_on_bf16 + --use_flash_attention \ + --flash_attention_recompute ``` +## Inference with FP8 -Here is an example to measure the tensor quantization statistics on Llava-v1.6-mistral-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` - -Here is an example to quantize the model based on previous measurements for Llava-v1.6-mistral-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --sdp_on_bf16 -``` +Inference with FP8 precision is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/index.html?highlight=inc), which provides model measurement and quantization capabilities in PyTorch. 
+More information on enabling FP8 in SynapseAI is available here: +[Run Inference Using FP8](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=fp8) -Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b: +### Single card inference with FP8 +Here is an example to measure the tensor quantization statistics on Llava-v1.6-vicuna-13b with SDPA: ```bash QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ @@ -188,7 +64,7 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --sdp_on_bf16 ``` -Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b: +Here is an example to quantize the model based on previous measurements for Llava-v1.6-vicuna-13b with SDPA: ```bash QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-vicuna-13b-hf \ @@ -198,25 +74,10 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python r --sdp_on_bf16 ``` -### Inference with FusedSDPA - -Habana FusedSDPA is a fused and optimized implementation of torch.nn.functional.scaled_dot_product_attention() for Gaudi. For more details, refer to [Gaudi online documentation](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_PyTorch_Models.html?highlight=fusedsdpa#using-fused-scaled-dot-product-attention-fusedsdpa). - -Use the following command to run Llava-1.5-7b BF16 inference with FusedSDPA -```bash -python3 run_pipeline.py \ - --model_name_or_path llava-hf/llava-1.5-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - - -Use the following command to run Llava-v1.6-mistral-7b BF16 inference with FusedSDPA +### Multi-cards inference with FP8 +Here is an example of measuring the tensor quantization statistics on Llava-v1.6-mistral-7b with FusedSDPA on 8 HPUs: ```bash -python3 run_pipeline.py \ +QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ @@ -225,12 +86,9 @@ python3 run_pipeline.py \ --flash_attention_recompute ``` - -Use the following commands to run Llava-v1.6-mistral-7b FP8 inference with FusedSDPA - -Here is an example of measuring the tensor quantization statistics on Llava-v1.6-mistral-7b: +Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b with FusedSDPA on 8 HPUs: ```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ +QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ @@ -239,88 +97,8 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --flash_attention_recompute ``` -Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python 
run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` ## LORA Finetune -To run LoRA finetuning, you can use `run_image2text_lora_finetune.py`. -Here are single-/multi-device command examples for HuggingFaceM4/idefics2-8b. - -```bash -python3 run_image2text_lora_finetune.py \ - --model_name_or_path HuggingFaceM4/idefics2-8b \ - --dataset_name nielsr/docvqa_1200_examples \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --gradient_accumulation_steps 8 \ - --weight_decay 0.01 \ - --logging_steps 25 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 5e-5 \ - --warmup_steps 50 \ - --lr_scheduler_type "constant" \ - --input_column_names 'image' 'query' \ - --output_column_names 'answers' \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --lora_rank=8 \ - --lora_alpha=8 \ - --lora_dropout=0.1 \ - --max_seq_length=512 \ - --use_hpu_graphs_for_inference \ - --low_cpu_mem_usage True \ - --lora_target_modules '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' -``` - -```bash -python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_image2text_lora_finetune.py \ - --model_name_or_path HuggingFaceM4/idefics2-8b \ - --dataset_name nielsr/docvqa_1200_examples \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --gradient_accumulation_steps 8 \ - --weight_decay 0.01 \ - --logging_steps 25 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 5e-5 \ - --warmup_steps 50 \ - --lr_scheduler_type "constant" \ - --input_column_names 'image' 'query' \ - --output_column_names 'answers' \ - --remove_unused_columns False \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --lora_rank=8 \ - --lora_alpha=8 \ - --lora_dropout=0.1 \ - --max_seq_length=512 \ - --use_hpu_graphs_for_inference \ - --low_cpu_mem_usage True \ - --lora_target_modules '".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"' -``` - Here are single-/multi-device command examples for meta-llama/Llama-3.2-11B-Vision-Instruct. ```bash @@ -390,54 +168,8 @@ python3 ../gaudi_spawn.py \ --lora_target_modules '".*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"' ``` -## Multi-HPU inference +The single card training command for llava-hf/llava-1.5-7b-hf is similar. 
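As a hedged illustration of that "similar" single-card command, the sketch below reuses the argument set from the idefics2 example formerly in this README and swaps in llava-hf/llava-1.5-7b-hf; the output directory and LoRA hyperparameters are placeholders, and `--lora_target_modules` is omitted so the script's automatic linear-layer discovery is used:

```bash
# Illustrative single-card LoRA fine-tuning sketch for llava-hf/llava-1.5-7b-hf
# (hyperparameters are placeholders; omit --lora_target_modules to use automatic discovery)
python3 run_image2text_lora_finetune.py \
    --model_name_or_path llava-hf/llava-1.5-7b-hf \
    --dataset_name nielsr/docvqa_1200_examples \
    --bf16 True \
    --output_dir ./model_lora_llava \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --weight_decay 0.01 \
    --logging_steps 25 \
    --eval_strategy "no" \
    --save_strategy "no" \
    --learning_rate 5e-5 \
    --warmup_steps 50 \
    --lr_scheduler_type "constant" \
    --input_column_names 'image' 'query' \
    --output_column_names 'answers' \
    --remove_unused_columns False \
    --do_train \
    --do_eval \
    --use_habana \
    --use_lazy_mode \
    --lora_rank=8 \
    --lora_alpha=8 \
    --lora_dropout=0.1 \
    --max_seq_length=512 \
    --use_hpu_graphs_for_inference \
    --low_cpu_mem_usage True
```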
-### BF16 Inference with FusedSDPA on 8 HPUs - -Use the following commands to run Llava-v1.6-mistral-7b BF16 inference with FusedSDPA on 8 HPUs: -```bash -python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - -Use the following commands to run Llama-3.2-90B-Vision-Instruct BF16 inference with FusedSDPA on 8 HPUs: -```bash -PT_HPU_ENABLE_LAZY_COLLECTIVES=true python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path meta-llama/Llama-3.2-90B-Vision-Instruct \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - - -### FP8 Inference with FusedSDPA on 8 HPUs - -Use the following commands to run Llava-v1.6-mistral-7b FP8 inference with FusedSDPA on 8 HPUs. -Here is an example of measuring the tensor quantization statistics on Llava-v1.6-mistral-7b on 8 HPUs: -```bash -QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` - -Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b on 8 HPUs: -```bash -QUANT_CONFIG=./quantization_config/maxabs_quant_scale_format_const.json python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_pipeline.py \ - --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ - --image_path "https://llava-vl.github.io/static/images/view.jpg" \ - --use_hpu_graphs \ - --bf16 \ - --use_flash_attention \ - --flash_attention_recompute -``` +> For different models, please adjust training parameters and `lora_target_modules`. Such as replace `lora_target_modules` +> with below for HuggingFaceM4/idefics2-8b. 
+> '".*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$"' diff --git a/examples/image-to-text/run_image2text_lora_finetune.py b/examples/image-to-text/run_image2text_lora_finetune.py index ded60e6d52..74020cd67d 100644 --- a/examples/image-to-text/run_image2text_lora_finetune.py +++ b/examples/image-to-text/run_image2text_lora_finetune.py @@ -298,7 +298,58 @@ def __call__(self, examples): return batch -def eval(processor, model, dataset, batch_size, use_lazy_mode, use_hpu_graphs, max_seq_length): +class LLavaDataCollator: + def __init__(self, processor, max_seq_length): + self.processor = processor + + num_image_tokens = (self.processor.image_processor.crop_size["height"] // self.processor.patch_size) * ( + self.processor.image_processor.crop_size["width"] // self.processor.patch_size + ) + 1 + if self.processor.vision_feature_select_strategy == "default": + num_image_tokens -= 1 + + # text length + image length + self.max_seq_length = max_seq_length + num_image_tokens + + def __call__(self, examples): + texts = [] + images = [] + + keys = list(examples[0].keys()) + if not all(key in ["image", "query", "answers"] for key in keys): + raise ValueError("Unsupported dataset format") + for example in examples: + image = example["image"] + question = example["query"]["en"] + answer = random.choice(example["answers"]) + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Answer briefly."}, + {"type": "image"}, + {"type": "text", "text": question}, + ], + }, + {"role": "assistant", "content": [{"type": "text", "text": answer}]}, + ] + text = self.processor.apply_chat_template(messages, add_generation_prompt=False) + texts.append(text.strip()) + images.append(image) + + batch = self.processor( + images, texts, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_seq_length + ) + + labels = batch["input_ids"].clone() + if self.processor.tokenizer.pad_token_id is not None: + labels[labels == self.processor.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + + return batch + + +def eval(processor, model, dataset, batch_size, use_lazy_mode, use_hpu_graphs, max_seq_length, model_type): from tqdm import tqdm answers_unique = [] @@ -307,7 +358,6 @@ def eval(processor, model, dataset, batch_size, use_lazy_mode, use_hpu_graphs, m for i in tqdm(range(0, len(dataset), batch_size)): examples = dataset[i : i + batch_size] answers_unique.extend(examples["answers"]) - images = [[im] for im in examples["image"]] texts = [] for q in examples["query"]: messages = [ @@ -322,14 +372,31 @@ def eval(processor, model, dataset, batch_size, use_lazy_mode, use_hpu_graphs, m ] text = processor.apply_chat_template(messages, add_generation_prompt=True) texts.append(text.strip()) - inputs = processor( - text=texts, - images=images, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=max_seq_length, - ) + + if model_type is not None and model_type == "llava": + images = [] + for im in examples["image"]: + images.append(im) + + inputs = processor( + images, + texts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_seq_length, + padding_side="left", + ) + else: + images = [[im] for im in examples["image"]] + inputs = processor( + text=texts, + images=images, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_seq_length, + ) inputs = {k: v.to("hpu") for k, v in inputs.items()} generated_ids = model.generate( 
**inputs, max_new_tokens=64, ignore_eos=False, lazy_mode=use_lazy_mode, hpu_graphs=use_hpu_graphs @@ -346,6 +413,22 @@ def eval(processor, model, dataset, batch_size, use_lazy_mode, use_hpu_graphs, m return anls +def find_all_linear_names(model): + cls = torch.nn.Linear + lora_module_names = set() + multimodal_keywords = ["mm_projector", "vision_tower", "vision_resampler"] + for name, module in model.named_modules(): + if any(mm_keyword in name for mm_keyword in multimodal_keywords): + continue + if isinstance(module, cls): + names = name.split(".") + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if "lm_head" in lora_module_names: # needed for 16-bit + lora_module_names.remove("lm_head") + return list(lora_module_names) + + def main(): parser = HfArgumentParser((ModelArguments, DataArguments, GaudiTrainingArguments, FinetuneArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): @@ -380,7 +463,7 @@ def main(): do_image_splitting=model_args.do_image_splitting, padding_side="right", ) - setattr(processor.image_processor, "pad_to_longest_edge", True) + config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, @@ -395,7 +478,13 @@ def main(): else: raise ValueError("Please provide value for model_name_or_path or config_name.") - # Load model + if config.model_type == "llava": + setattr(processor, "patch_size", config.vision_config.patch_size) + setattr(processor, "vision_feature_select_strategy", config.vision_feature_select_strategy) + else: + setattr(processor.image_processor, "pad_to_longest_edge", True) + + # Load model if model_args.model_name_or_path: model_dtype = torch.bfloat16 if training_args.bf16 else None model = AutoModelForVision2Seq.from_pretrained( @@ -413,11 +502,16 @@ def main(): else: raise ValueError("Must provide model_name_or_path to load a pretrained CausalLM model.") + if finetune_args.lora_target_modules is None: + target_modules = find_all_linear_names(model) + else: + target_modules = finetune_args.lora_target_modules + lora_config = LoraConfig( r=finetune_args.lora_rank, lora_alpha=finetune_args.lora_alpha, lora_dropout=finetune_args.lora_dropout, - target_modules=finetune_args.lora_target_modules, + target_modules=target_modules, init_lora_weights="gaussian", ) model = get_peft_model(model, lora_config) @@ -456,15 +550,21 @@ def main(): if col not in (data_args.input_column_names + data_args.output_column_names) ] ) - if hasattr(config, "image_token_id"): - # idefics - image_token_id = config.image_token_id - elif hasattr(config, "image_token_index"): - # mllama - image_token_id = config.image_token_index + if config.model_type == "llava": + data_collator = LLavaDataCollator(processor, max_seq_length=data_args.max_seq_length) else: - raise ValueError("Please provide value for image_token_id") - data_collator = MyDataCollator(processor, max_seq_length=data_args.max_seq_length, image_token_id=image_token_id) + if hasattr(config, "image_token_id"): + # idefics + image_token_id = config.image_token_id + elif hasattr(config, "image_token_index"): + # mllama + image_token_id = config.image_token_index + else: + raise ValueError("Please provide value for image_token_id") + + data_collator = MyDataCollator( + processor, max_seq_length=data_args.max_seq_length, image_token_id=image_token_id + ) gaudi_config = GaudiConfig() gaudi_config.use_fused_adam = True @@ -509,14 +609,29 @@ def main(): } ] text = processor.apply_chat_template(messages, add_generation_prompt=True) - inputs = processor( - 
text=[text.strip()], - images=[image], - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=data_args.max_seq_length, - ) + + if config.model_type == "llava": + # don't expand image_token_id + setattr(processor, "patch_size", None) + setattr(processor, "vision_feature_select_strategy", None) + inputs = processor( + [image], + [text.strip()], + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=data_args.max_seq_length, + padding_side="left", + ) + else: + inputs = processor( + text=[text.strip()], + images=[image], + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=data_args.max_seq_length, + ) inputs = {k: v.to("hpu") for k, v in inputs.items()} generated_ids = model.generate( **inputs, @@ -543,6 +658,7 @@ def main(): use_lazy_mode=training_args.use_lazy_mode, use_hpu_graphs=training_args.use_hpu_graphs_for_inference, max_seq_length=data_args.max_seq_length, + model_type=config.model_type, ) eval_metrics = {"eval_accuracy": anls} trainer.log_metrics("eval", eval_metrics) diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 44eb8d575a..f75bde19c2 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -25,6 +25,10 @@ import torch from transformers import AutoConfig, AutoModelForVision2Seq, AutoProcessor, pipeline +from optimum.habana.utils import ( + set_seed, +) + logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -179,6 +183,23 @@ def main(): action="store_true", help="Allow PyTorch to use reduced precision in the SDPA math backend", ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=None, + help="If > 0 then pad the input sequences to this specified length of tokens. will not apply truncate to avoid deleting the image tag", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", + ) args = parser.parse_args() @@ -192,14 +213,18 @@ def main(): os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") if args.world_size > 0: os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + os.environ.setdefault("DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API", "1") from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi adapt_transformers_to_gaudi() + set_seed(args.seed) + config = AutoConfig.from_pretrained(args.model_name_or_path) model_type = config.model_type - if args.image_path is None and model_type in ["llava", "idefics2", "mllama"]: + + if args.image_path is None and model_type in ["llava", "idefics2", "mllama", "qwen2_vl"]: args.image_path = ["https://llava-vl.github.io/static/images/view.jpg"] elif args.image_path is None and model_type == "paligemma": args.image_path = [ @@ -210,8 +235,8 @@ def main(): "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" ] - if model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma"]: - processor = AutoProcessor.from_pretrained(args.model_name_or_path) + if model_type in ["llava", "idefics2", "llava_next", "mllama", "paligemma", "qwen2_vl"]: + processor = AutoProcessor.from_pretrained(args.model_name_or_path, padding_side="left") if args.prompt is None: if processor.chat_template is not None: conversation = [ @@ -289,6 +314,9 @@ def main(): generator = pipeline( "image-to-text", model=args.model_name_or_path, + config=args.model_name_or_path, + tokenizer=args.model_name_or_path, + image_processor=args.model_name_or_path, torch_dtype=model_dtype, device="hpu", ) @@ -308,6 +336,7 @@ def main(): "use_flash_attention": args.use_flash_attention, "flash_attention_recompute": args.flash_attention_recompute, "limit_hpu_graphs": args.limit_hpu_graphs, + "do_sample": args.do_sample, } if args.sdp_on_bf16: @@ -316,17 +345,27 @@ def main(): if args.use_kv_cache: generate_kwargs["use_cache"] = args.use_kv_cache + if model_type == "qwen2_vl": + generate_kwargs["use_cache"] = True + generate_kwargs["cache_implementation"] = "static" + if args.quant_config: generator.model = setup_quantization(generator.model, args) htcore.hpu_initialize(generator.model) # delete once pipeline integrate AutoProcessor as preprocess engine - if model_type in ["idefics2", "mllama", "paligemma"]: + # could use "image-text-to-text" pipeline in transformers 4.47 + + if model_type in ["idefics2", "mllama", "paligemma", "qwen2_vl", "llava", "llava_next"]: from transformers.image_utils import load_image def preprocess(self, image, prompt=None, timeout=None): + kwargs = {} + if args.max_input_tokens is not None and args.max_input_tokens > 0: + kwargs["max_length"] = args.max_input_tokens + kwargs["padding"] = "max_length" image = load_image(image, timeout=timeout) - model_inputs = processor(images=image, text=prompt, return_tensors=self.framework) + model_inputs = processor(images=image, text=prompt, return_tensors=self.framework, **kwargs) return model_inputs generator.__class__.preprocess = preprocess @@ -349,13 +388,18 @@ def preprocess(self, image, prompt=None, timeout=None): n_output_tokens = 0 for sequence in result: # We have to subtract the number of input tokens as they are part of the returned sequence - n_output_tokens += len(generator.tokenizer(sequence[0]["generated_text"]).input_ids) - n_input_tokens + # TODO this is not accurate, args.prompt contains flag like <|im_start|>, 
<|im_end|>, while generated_text does not contain it + # if it's text+image prompt, should use "image-text-to-text" pipeline after transformers 4.47 + if not args.ignore_eos: + n_output_tokens += len(generator.tokenizer(sequence[0]["generated_text"]).input_ids) - n_input_tokens + else: + n_output_tokens += args.max_new_tokens total_new_tokens_generated = args.n_iterations * n_output_tokens throughput = total_new_tokens_generated / duration logger.info(f"result = {result}") logger.info( - f"time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second" + f"time = {(end - start) * 1000 / args.n_iterations}ms, Throughput (including tokenization) = {throughput} tokens/second" ) # Store results if necessary diff --git a/examples/kubernetes/Chart.yaml b/examples/kubernetes/Chart.yaml index dc0400ccb0..d1c1778076 100644 --- a/examples/kubernetes/Chart.yaml +++ b/examples/kubernetes/Chart.yaml @@ -3,7 +3,7 @@ name: optimum-habana-example-chart description: This Helm chart deploys example jobs using Optimum for IntelĀ® GaudiĀ® Accelerators to a Kubernetes cluster. # Compatible Kubernetes versions -kubeVersion: 1.27-1.29 +kubeVersion: 1.27 - 1.29 # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. diff --git a/examples/kubernetes/Dockerfile b/examples/kubernetes/Dockerfile index 7ebfd93894..2264dfca57 100644 --- a/examples/kubernetes/Dockerfile +++ b/examples/kubernetes/Dockerfile @@ -1,7 +1,7 @@ -ARG GAUDI_SW_VER=1.19.0 +ARG GAUDI_SW_VER=1.20.0 ARG OS=ubuntu22.04 -ARG TORCH_VER=2.5.1 -ARG OPTIMUM_HABANA_VER=1.15.0 +ARG TORCH_VER=2.6.0 +ARG OPTIMUM_HABANA_VER=1.16.0 FROM vault.habana.ai/gaudi-docker/${GAUDI_SW_VER}/${OS}/habanalabs/pytorch-installer-${TORCH_VER}:latest AS optimum-habana diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md index 06f4f01d09..fe65d41482 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -43,12 +43,12 @@ Use the the following commands to build the containers: ```bash # Specify the Gaudi SW version, OS, and PyTorch version which will be used for the base container -export GAUDI_SW_VER=1.19.0 +export GAUDI_SW_VER=1.20.0 export OS=ubuntu22.04 -export TORCH_VER=2.5.1 +export TORCH_VER=2.6.0 # Specify the version of optimum-habana to install in the container -export OPTIMUM_HABANA_VER=1.15.0 +export OPTIMUM_HABANA_VER=1.16.0 git clone https://github.com/huggingface/optimum-habana.git diff --git a/examples/kubernetes/README.md.gotmpl b/examples/kubernetes/README.md.gotmpl index 431f8ad611..48f0af8259 100644 --- a/examples/kubernetes/README.md.gotmpl +++ b/examples/kubernetes/README.md.gotmpl @@ -43,12 +43,12 @@ Use the the following commands to build the containers: ```bash # Specify the Gaudi SW version, OS, and PyTorch version which will be used for the base container -export GAUDI_SW_VER=1.19.0 +export GAUDI_SW_VER=1.20.0 export OS=ubuntu22.04 -export TORCH_VER=2.5.1 +export TORCH_VER=2.6.0 # Specify the version of optimum-habana to install in the container -export OPTIMUM_HABANA_VER=1.15.0 +export OPTIMUM_HABANA_VER=1.16.0 git clone https://github.com/huggingface/optimum-habana.git diff --git a/examples/kubernetes/docker-compose.yaml b/examples/kubernetes/docker-compose.yaml index 6bdea75bbd..4ab69f1021 100644 --- a/examples/kubernetes/docker-compose.yaml +++ b/examples/kubernetes/docker-compose.yaml @@ -5,30 +5,30 @@ services: http_proxy: 
${http_proxy:-""} https_proxy: ${https_proxy:-""} no_proxy: ${no_proxy:-""} - GAUDI_SW_VER: ${GAUDI_SW_VER:-1.19.0} + GAUDI_SW_VER: ${GAUDI_SW_VER:-1.20.0} OS: ${OS:-ubuntu22.04} - OPTIMUM_HABANA_VER: ${OPTIMUM_HABANA_VER:-1.15.0} - TORCH_VER: ${TORCH_VER:-2.5.1} + OPTIMUM_HABANA_VER: ${OPTIMUM_HABANA_VER:-1.16.0} + TORCH_VER: ${TORCH_VER:-2.6.0} REGISTRY: ${REGISTRY} REPO: ${REPO} context: . labels: - org.opencontainers.base.name: "vault.habana.ai/gaudi-docker/${GAUDI_SW_VER:-1.19.0}/${OS:-ubuntu22.04}/habanalabs/pytorch-installer-${TORCH_VER:-2.5.1}:latest" + org.opencontainers.base.name: "vault.habana.ai/gaudi-docker/${GAUDI_SW_VER:-1.20.0}/${OS:-ubuntu22.04}/habanalabs/pytorch-installer-${TORCH_VER:-2.6.0}:latest" org.opencontainers.image.title: "Optimum for IntelĀ® GaudiĀ® Accelerators" - org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0} + org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.20.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.16.0} command: > sh -c "python -c 'from optimum import habana; print(\"optimum-habana:\", habana.__version__)'" - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.20.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.16.0} pull_policy: always optimum-habana-examples: build: labels: - org.opencontainers.base.name: "${REGISTRY}/${REPO}:gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.15.0}" + org.opencontainers.base.name: "${REGISTRY}/${REPO}:gaudi-${GAUDI_SW_VER:-1.20.0}-optimum-habana-${OPTIMUM_HABANA_VER:-1.16.0}" org.opencontainers.image.title: "Optimum for IntelĀ® GaudiĀ® Accelerators Examples" - org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.15.0} + org.opencontainers.image.version: gaudi-${GAUDI_SW_VER:-1.20.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.16.0} target: optimum-habana-examples command: > sh -c "python -c 'from optimum import habana; print(\"optimum-habana:\", habana.__version__)'" extends: optimum-habana - image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.19.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.15.0} + image: ${REGISTRY}/${REPO}:b-${GITHUB_RUN_NUMBER:-0}-gaudi-${GAUDI_SW_VER:-1.20.0}-optimum-habana-examples-${OPTIMUM_HABANA_VER:-1.16.0} diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index 9ef27f9e73..6abf93e187 100644 --- a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -131,60 +131,6 @@ python ../gaudi_spawn.py \ This example has been validated with the following DeepSpeed ZeRO-2 config: https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_2.json -### Multi-card Training with Deepspeed (chatglm3-6b) -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --config_name THUDM/chatglm3-6b \ - --tokenizer_name THUDM/chatglm3-6b \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 6 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --deepspeed llama2_ds_zero3_config.json \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --bf16 \ - --block_size 1024 \ - --use_cache False \ - --overwrite_output_dir \ - 
--logging_first_step True \ - --logging_steps 20 -``` - -### Multi-card Training with Deepspeed (Baichuan2-13B-Chat) -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --config_name baichuan-inc/Baichuan2-13B-Chat \ - --tokenizer_name baichuan-inc/Baichuan2-13B-Chat \ - --dataset_name wikitext \ - --num_train_epochs 30 \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 2 \ - --per_device_eval_batch_size 2 \ - --do_train \ - --do_eval \ - --deepspeed llama2_ds_zero3_config.json \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --bf16 \ - --block_size 1024 \ - --use_cache False \ - --overwrite_output_dir \ - --logging_first_step True \ - --logging_steps 20 -``` - - ## Multi-Node Training with Deepspeed (GPT-NeoX) The following command triggers the fine-tuning of [GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b) on WikiText-2 with Deepspeed ZeRO-2. @@ -226,10 +172,11 @@ Following the RoBERTa paper, we use dynamic masking rather than static masking. converge slightly slower (over-fitting takes more epochs). -### Single-card Training +### Multi-card Training ```bash -python run_mlm.py \ +python ../gaudi_spawn.py \ + --world_size 8 --use_mpi run_mlm.py \ --model_name_or_path roberta-base \ --dataset_name wikitext \ --dataset_config_name wikitext-2-raw-v1 \ @@ -246,54 +193,12 @@ python run_mlm.py \ --bf16 ``` -To run on your own training and validation files, use the following command: - -```bash -python run_mlm.py \ - --model_name_or_path roberta-base \ - --train_file path_to_train_file \ - --validation_file path_to_validation_file \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - If your dataset is organized with one sample per line, you can use the `--line_by_line` flag (otherwise the script concatenates all texts and then splits them into blocks of the same length). **Note:** On HPU, you should use the flag `--pad_to_max_length` in conjunction with the `--line_by_line` flag to make sure all your batches have the same length. -### Multi-card Training - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_mlm.py \ - --model_name_or_path roberta-base \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --per_device_eval_batch_size 8 \ - --do_train \ - --do_eval \ - --output_dir /tmp/test-mlm \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/roberta-base \ - --throughput_warmup_steps 3 \ - --bf16 -``` - - ### Training in torch.compile mode RoBERTa-Large model training in [torch.compile](pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) mode is enabled by applying the following changes to your command, a) Set the following environment variables `PT_HPU_LAZY_MODE=0` and `PT_ENABLE_INT64_SUPPORT=1`. @@ -324,78 +229,6 @@ python run_clm.py \ --bf16 ``` - -## Using DeepSpeed - -Multi-card examples can be simply adapted to be run with DeepSpeed. 
Here is the CLM example with GPT2-XL: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path gpt2-xl \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 4 \ - --do_train \ - --do_eval \ - --learning_rate 4e-4 \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/gpt2 \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - -Here is another example with Bloom-7B1: - -```bash -DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 PT_HPU_MAX_COMPOUND_OP_SYNC=1 PT_HPU_MAX_COMPOUND_OP_SIZE=1 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_clm.py \ - --model_name_or_path bigscience/bloom-7b1 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --per_device_train_batch_size 8 \ - --do_train \ - --output_dir /tmp/test-clm \ - --gaudi_config_name Habana/roberta-base \ - --use_habana \ - --use_lazy_mode \ - --gradient_checkpointing \ - --use_cache False \ - --throughput_warmup_steps 3 \ - --save_strategy "no" \ - --learning_rate 1e-04 \ - --deepspeed path_to_my_deepspeed_config -``` -[This](https://github.com/huggingface/optimum-habana/blob/main/tests/configs/deepspeed_zero_3_gaudi1.json) is a DeepSpeed configuration you can use to train this model on Gaudi1. - - ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... 
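For instance, an evaluation-only run of `run_clm.py` on WikiText-2 could look like the sketch below (the flags are taken from the training examples above; the model, batch size and output directory are placeholders to adapt to your setup):

```bash
python run_clm.py \
    --model_name_or_path gpt2 \
    --dataset_name wikitext \
    --dataset_config_name wikitext-2-raw-v1 \
    --do_eval \
    --per_device_eval_batch_size 4 \
    --output_dir /tmp/test-clm-eval \
    --gaudi_config_name Habana/gpt2 \
    --use_habana \
    --use_lazy_mode \
    --use_hpu_graphs_for_inference \
    --bf16
```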
@@ -456,141 +289,6 @@ python3 run_lora_clm.py \ --validation_split_percentage 4 \ --adam_epsilon 1e-08 ``` -- Single-card finetuning of Falcon-40B: -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-40b \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --low_cpu_mem_usage True \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 -``` - -- Multi-card finetuning of Llama1-7B: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path huggyllama/llama-7b \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 8 \ - --gradient_accumulation_steps 2 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 1 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 4 \ - --low_cpu_mem_usage True -``` - -- Multi-card finetuning of Llama2-7B with FP8: -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --dataset_name tatsu-lab/alpaca \ - --bf16 True \ - --output_dir ./model_lora_llama \ - --num_train_epochs 3 \ - --per_device_train_batch_size 16 \ - --gradient_accumulation_steps 1 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 3e-4 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --max_grad_norm 0.3 \ - --logging_steps 20 \ - --do_train \ - --do_eval \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 18 \ - --lora_rank=8 \ - --lora_alpha=16 \ - --lora_dropout=0.05 \ - --lora_target_modules "q_proj" "v_proj" \ - --dataset_concatenation \ - --max_seq_length 512 \ - --ddp_bucket_cap_mb 50 \ - --adam_epsilon 1e-08 \ - --validation_split_percentage 10 \ - --low_cpu_mem_usage True \ - --pipelining_fwd_bwd \ - --fp8 True -``` - -- Multi-card finetuning of codegen-16B-mono: -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_lora_clm.py \ - --model_name_or_path Salesforce/codegen-16B-mono \ - --dataset_name b-mc2/sql-create-context \ - --sql_prompt \ - --bf16 True \ - --output_dir ./finetuned-models/codegen-finetune-on-sql-create-context-hpu8-lora8-bs4 \ - --num_train_epochs 5 \ - --per_device_train_batch_size 4 \ - --per_device_eval_batch_size 4 \ - --eval_strategy "no" \ - --save_strategy "no" \ - 
--learning_rate 1e-4 \ - --logging_steps 1 \ - --dataset_concatenation \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_inference \ - --lora_target_modules "qkv_proj" \ - --lora_rank 8 \ - --do_eval \ - --validation_split_percentage 10 \ - --use_cache False -``` - Multi-card finetuning of gemma2 using chat template: ```bash @@ -740,43 +438,6 @@ python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \ --flash_attention_causal_mask True ``` -- Multi-card finetuning of Falcon-180B: - - Falcon-180B example command saves only the LoRA parameters at end - - For inference we need to merge the pretrained model and LoRA weights -```bash -PT_HPU_AUTOCAST_LOWER_PRECISION_OPS_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_lora_clm.py \ - --model_name_or_path tiiuae/falcon-180B \ - --dataset_name timdettmers/openassistant-guanaco \ - --bf16 True \ - --output_dir ./model_lora_falcon_ddp \ - --num_train_epochs 3 \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 16 \ - --eval_strategy "no" \ - --save_strategy "no" \ - --learning_rate 4e-4 \ - --max_grad_norm 0.3 \ - --warmup_ratio 0.03 \ - --lr_scheduler_type "constant" \ - --logging_steps 1 \ - --do_train \ - --use_habana \ - --use_lazy_mode \ - --pipelining_fwd_bwd \ - --throughput_warmup_steps 3 \ - --lora_rank=64 \ - --lora_alpha=16 \ - --lora_dropout=0.1 \ - --lora_target_modules "query_key_value" "dense" "dense_h_to_4h" "dense_4h_to_h" \ - --dataset_concatenation \ - --max_seq_length 256 \ - --adam_epsilon 1e-08 \ - --do_eval \ - --validation_split_percentage 5 \ - --deepspeed ds_falcon_180b_z3.json -``` Default `peft_type` is `lora`, you could enable adalora or ia3 using `--peft_type adalora` or `--peft_type ia3`, or enable llama-adapter for llama model using `--peft_type llama-adapter`, or enable ln-tuning using `--peft_type ln_tuning`, or enable vera using `--peft_type vera`. #### Custom Files @@ -824,7 +485,7 @@ The format of the text files (with extensions .text or .txt) is expected to be ### Prompt/Prefix/P-tuning To run prompt tuning finetuning, you can use `run_prompt_tuning_clm.py`. -Here are single-/multi-device command examples for Llama2-7B: +Here are single-card command examples for Llama2-7B: - single-card finetuning of meta-llama/Llama-2-7b-hf with dataset "ought/raft" and config "twitter_complaints": ```bash python3 run_prompt_tuning_clm.py \ @@ -844,25 +505,6 @@ python3 run_prompt_tuning_clm.py \ --use_lazy_mode ``` -- multi-card finetuning of meta-llama/Llama-2-7b-hf with dataset "ought/raft" and config "twitter_complaints": -```bash -python3 ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_prompt_tuning_clm.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --output_dir prompt_tuning_out \ - --bf16 True \ - --report_to=none \ - --per_device_train_batch_size 1 \ - --per_device_eval_batch_size 1 \ - --gradient_accumulation_steps 1 \ - --low_cpu_mem_usage True \ - --logging_steps 1 \ - --do_train \ - --num_train_epochs 50 \ - --do_eval \ - --use_habana \ - --use_lazy_mode -``` Default `peft_type` is `prompt_tuning`, you could enable prefix-tuning or p-tuning using `--peft_type prefix_tuning` or `--peft_type p_tuning`. 
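For instance, a prefix-tuning variant of the single-card command above would only differ by the added flag (a sketch, not separately validated; the output directory name is illustrative):

```bash
python3 run_prompt_tuning_clm.py \
    --peft_type prefix_tuning \
    --model_name_or_path meta-llama/Llama-2-7b-hf \
    --output_dir prefix_tuning_out \
    --bf16 True \
    --report_to=none \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --low_cpu_mem_usage True \
    --logging_steps 1 \
    --do_train \
    --num_train_epochs 50 \
    --do_eval \
    --use_habana \
    --use_lazy_mode
```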
Use the prompt finetuned model for text-generation: @@ -935,7 +577,6 @@ We have added support for [Deepspeed Ulysses](https://github.com/microsoft/DeepS > This feature is still in beta version and may not work out of the box for all transformer model architectures and configurations. ```bash -HL_DS_DISTRIBUTED_ATTENTION_SEQ_DIM=1 \ python3 ../gaudi_spawn.py \ --world_size 8 --use_deepspeed run_lora_clm.py \ --model_name_or_path meta-llama/Llama-3.1-8B \ diff --git a/examples/language-modeling/llama3_ds_zero1_config.json b/examples/language-modeling/llama3_ds_zero1_config.json new file mode 100755 index 0000000000..50a1f46b7d --- /dev/null +++ b/examples/language-modeling/llama3_ds_zero1_config.json @@ -0,0 +1,20 @@ +{ + "steps_per_print": 64, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "bf16": { + "enabled": true + }, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 1, + "contiguous_gradients": false + }, + "timers": { + "throughput": { + "enabled": true, + "synchronized": false + } + } +} diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index feac065364..bd86aacc23 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") @@ -156,6 +156,32 @@ class ModelArguments: ) }, ) + attn_softmax_bf16: bool = field( + default=False, + metadata={"help": ("Whether to run attention softmax layer in bf16 precision for fine-tuning.")}, + ) + use_flash_attention: bool = field( + default=False, + metadata={"help": ("Whether to use Habana flash attention for fine-tuning.")}, + ) + flash_attention_recompute: bool = field( + default=False, + metadata={ + "help": ( + "Whether to enable recompute in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." + ) + }, + ) + flash_attention_causal_mask: bool = field( + default=False, + metadata={ + "help": ( + "Whether to enable causal mask in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." + ) + }, + ) low_cpu_mem_usage: bool = field( default=False, metadata={ @@ -472,7 +498,7 @@ def main(): else: model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params") # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. 
@@ -482,6 +508,14 @@ def main(): if len(tokenizer) > embedding_size: model.resize_token_embeddings(len(tokenizer)) + # We need to add these fused kernels config + if model_args.attn_softmax_bf16: + model.generation_config.attn_softmax_bf16 = True + if model_args.use_flash_attention: + model.generation_config.use_flash_attention = True + model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute + model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask + # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index 3ff7fbfd3a..29a5e5201a 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -70,7 +70,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") @dataclass diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 2de43c910b..90c1580f37 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_multitask_prompt_tuning.py b/examples/language-modeling/run_multitask_prompt_tuning.py index 9f955db44e..10ecfd0e47 100644 --- a/examples/language-modeling/run_multitask_prompt_tuning.py +++ b/examples/language-modeling/run_multitask_prompt_tuning.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risk. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py index 44ea542d14..3da61970dd 100644 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ b/examples/language-modeling/run_prompt_tuning_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/multi-node-training/EFA/Dockerfile b/examples/multi-node-training/EFA/Dockerfile index bc6f827164..8b83af7d9d 100644 --- a/examples/multi-node-training/EFA/Dockerfile +++ b/examples/multi-node-training/EFA/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest # Installs pdsh and upgrade pip RUN apt-get update && apt-get install -y pdsh && \ @@ -19,7 +19,7 @@ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ # Installs Optimum Habana and Habana's fork of DeepSpeed RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ chmod 600 ~/.ssh/id_rsa && \ diff --git a/examples/multi-node-training/GaudiNIC/.deepspeed_env b/examples/multi-node-training/GaudiNIC/.deepspeed_env new file mode 100644 index 0000000000..0fa8686f68 --- /dev/null +++ b/examples/multi-node-training/GaudiNIC/.deepspeed_env @@ -0,0 +1,5 @@ +GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so +HABANA_LOGS=/var/log/habana_logs/ +HABANA_SCAL_BIN_PATH=/opt/habanalabs/engines_fw +HABANA_PLUGINS_LIB_PATH=/opt/habanalabs/habana_plugins +DATA_LOADER_AEON_LIB_PATH=/usr/lib/habanalabs/libaeon.so diff --git a/examples/multi-node-training/GaudiNIC/Dockerfile b/examples/multi-node-training/GaudiNIC/Dockerfile index 5375a6fcc7..09a98e6bb9 100644 --- a/examples/multi-node-training/GaudiNIC/Dockerfile +++ b/examples/multi-node-training/GaudiNIC/Dockerfile @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest +FROM vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest # Installs pdsh and upgrade pip RUN apt-get update && apt-get install -y pdsh && \ @@ -13,7 +13,7 @@ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \ # Installs Optimum Habana and Habana's fork of DeepSpeed RUN pip install optimum[habana] && \ - pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 + pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \ chmod 600 ~/.ssh/id_rsa && \ diff --git a/examples/multi-node-training/README.md b/examples/multi-node-training/README.md index 0e40e616f8..bc0ba8fda0 100644 --- a/examples/multi-node-training/README.md +++ b/examples/multi-node-training/README.md @@ -111,6 +111,10 @@ env_variable_2_name=value ... ``` +You can find an example for GaudiNIC instances [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/GaudiNIC/.deepspeed_env). + +> Note above environment variables refers to /etc/profile.d/habanalabs.sh inside docker, and should set only on GaudiNIC master node. + You can find an example for AWS instances [here](https://github.com/huggingface/optimum-habana/tree/main/examples/multi-node-training/EFA/.deepspeed_env). 
> Note that one should set `HCCL_OVER_OFI=1` and `LD_LIBRARY_PATH=/root/hccl_ofi_wrapper:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib` only on AWS DL1 instances. *These should not be used otherwise*. diff --git a/examples/object-detection/README.md b/examples/object-detection/README.md index aa82013326..0ce639dc9b 100644 --- a/examples/object-detection/README.md +++ b/examples/object-detection/README.md @@ -28,7 +28,3 @@ python3 run_example.py \ --bf16 \ --print_result ``` - -Models that have been validated: - - [facebook/detr-resnet-101](https://huggingface.co/facebook/detr-resnet-101) - - [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) \ No newline at end of file diff --git a/examples/object-segementation/README.md b/examples/object-segementation/README.md index 936180e4f2..2b8728eb56 100644 --- a/examples/object-segementation/README.md +++ b/examples/object-segementation/README.md @@ -30,8 +30,6 @@ python3 run_example.py \ --bf16 \ --print_result ``` -Models that have been validated: - - [clipseg-rd64-refined ](https://huggingface.co/CIDAS/clipseg-rd64-refined) ### Segment Anything Model @@ -45,7 +43,4 @@ python3 run_example_sam.py \ --use_hpu_graphs \ --bf16 \ --print_result -``` -Models that have been validated: - - [facebook/sam-vit-base](https://huggingface.co/facebook/sam-vit-base) - - [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) \ No newline at end of file +``` \ No newline at end of file diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py index 230d1c61e8..f4c2de6ef3 100644 --- a/examples/protein-folding/run_esmfold.py +++ b/examples/protein-folding/run_esmfold.py @@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") def convert_outputs_to_pdb(outputs): diff --git a/examples/protein-folding/run_sequence_classification.py b/examples/protein-folding/run_sequence_classification.py index fa35d8b803..f382b1c571 100644 --- a/examples/protein-folding/run_sequence_classification.py +++ b/examples/protein-folding/run_sequence_classification.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/examples/protein-folding/run_zero_shot_eval.py b/examples/protein-folding/run_zero_shot_eval.py index 7da135f080..b4717f3d37 100644 --- a/examples/protein-folding/run_zero_shot_eval.py +++ b/examples/protein-folding/run_zero_shot_eval.py @@ -36,7 +36,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logging.basicConfig( diff --git a/examples/pytorch-image-models/README.md b/examples/pytorch-image-models/README.md index f1dc21ddf4..731e61d612 100644 --- a/examples/pytorch-image-models/README.md +++ b/examples/pytorch-image-models/README.md @@ -16,20 +16,7 @@ limitations under the License. 
# pyTorch-IMage-Models (TIMM) Examples with HPUs -This directory contains the scripts that showcases how to inference/fine-tune the TIMM models on intel's HPUs with the lazy/graph modes. We support the trainging for single/multiple HPU cards both two. Currently we support several most downloadable models from Hugging Face as below list. - -- [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) -- [timm/resnet18.a1_in1k](https://huggingface.co/timm/resnet18.a1_in1k) -- [timm/resnet18.fb_swsl_ig1b_ft_in1k](https://huggingface.co/timm/resnet18.fb_swsl_ig1b_ft_in1k) -- [timm/wide_resnet50_2.racm_in1k](https://huggingface.co/timm/wide_resnet50_2.racm_in1k) -- [timm/efficientnet_b3.ra2_in1k](https://huggingface.co/timm/efficientnet_b3.ra2_in1k) -- [timm/efficientnet_lite0.ra_in1k](https://huggingface.co/timm/efficientnet_lite0.ra_in1k) -- [timm/efficientnet_b0.ra_in1k](https://huggingface.co/timm/efficientnet_b0.ra_in1k) -- [timm/nf_regnet_b1.ra2_in1k](https://huggingface.co/timm/nf_regnet_b1.ra2_in1k) -- [timm/mobilenetv3_large_100.ra_in1k](https://huggingface.co/timm/mobilenetv3_large_100.ra_in1k) -- [timm/tf_mobilenetv3_large_minimal_100.in1k](https://huggingface.co/timm/tf_mobilenetv3_large_minimal_100.in1k) -- [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k](https://huggingface.co/timm/vit_base_patch16_224.augreg2_in21k_ft_in1k) -- [timm/vgg19.tv_in1k](https://huggingface.co/timm/vgg19.tv_in1k) +This directory contains scripts that showcase how to run inference with and fine-tune TIMM models on Intel's HPUs in lazy and graph modes. Training is supported on both single and multiple HPU cards. Currently, the 10 most downloaded models from the [Hugging Face timm hub](https://huggingface.co/timm) are supported. The examples below use [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) as the test model for both inference and training; the other supported models can be used in the same way. ## Requirements @@ -46,20 +33,6 @@ pip install . Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.co/datasets/johnowhitaker/imagenette2-320) and model with [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) from Hugging Face.
-### Training with HPU lazy mode - -```bash -python train_hpu_lazy.py \ - --data-dir ./ \ - --dataset hfds/johnowhitaker/imagenette2-320 \ - --device 'hpu' \ - --model resnet50.a1_in1k \ - --train-split train \ - --val-split train \ - --dataset-download -``` - -python train_hpu_lazy.py --data-dir='./' --dataset hfds/johnowhitaker/imagenette2-320 --device='hpu' --model resnet50.a1_in1k ### Training with HPU graph mode ```bash @@ -70,41 +43,13 @@ python train_hpu_graph.py \ --model resnet50.a1_in1k \ --train-split train \ --val-split train \ - --dataset-download + --dataset-download ``` -Here the results for lazy mode is shown below for example: - -```bash -Train: 0 [ 0/73 ( 1%)] Loss: 6.86 (6.86) Time: 9.575s, 13.37/s (9.575s, 13.37/s) LR: 1.000e-05 Data: 0.844 (0.844) -Train: 0 [ 50/73 ( 70%)] Loss: 6.77 (6.83) Time: 0.320s, 400.32/s (0.470s, 272.39/s) LR: 1.000e-05 Data: 0.217 (0.047) -Test: [ 0/30] Time: 6.593 (6.593) Loss: 6.723 ( 6.723) Acc@1: 0.000 ( 0.000) Acc@5: 0.000 ( 0.000) -Test: [ 30/30] Time: 3.856 (0.732) Loss: 6.615 ( 6.691) Acc@1: 0.000 ( 0.076) Acc@5: 1.176 ( 3.287) - -Train: 1 [ 0/73 ( 1%)] Loss: 6.69 (6.69) Time: 0.796s, 160.74/s (0.796s, 160.74/s) LR: 1.001e-02 Data: 0.685 (0.685) -Train: 1 [ 50/73 ( 70%)] Loss: 3.23 (3.76) Time: 0.160s, 798.85/s (0.148s, 863.22/s) LR: 1.001e-02 Data: 0.053 (0.051) -Test: [ 0/30] Time: 0.663 (0.663) Loss: 1.926 ( 1.926) Acc@1: 46.094 ( 46.094) Acc@5: 85.938 ( 85.938) -Test: [ 30/30] Time: 0.022 (0.126) Loss: 1.462 ( 1.867) Acc@1: 63.529 ( 39.261) Acc@5: 83.529 ( 85.096) - -``` - - ## Multi-HPU training Here we show how to fine-tune the [imagenette2-320 dataset](https://huggingface.co/datasets/johnowhitaker/imagenette2-320) and model with [timm/resnet50.a1_in1k](https://huggingface.co/timm/resnet50.a1_in1k) from Hugging Face. 
-### Training with HPU lazy mode -```bash -torchrun --nnodes 1 --nproc_per_node 2 \ - train_hpu_lazy.py \ - --data-dir ./ \ - --dataset hfds/johnowhitaker/imagenette2-320 \ - --device 'hpu' \ - --model resnet50.a1_in1k \ - --train-split train \ - --val-split train \ - --dataset-download -``` ### Training with HPU graph mode ```bash @@ -119,20 +64,6 @@ torchrun --nnodes 1 --nproc_per_node 2 \ --dataset-download ``` -Here the results for lazy mode is shown below for example: - -```bash -Train: 0 [ 0/36 ( 3%)] Loss: 6.88 (6.88) Time: 10.036s, 25.51/s (10.036s, 25.51/s) LR: 1.000e-05 Data: 0.762 (0.762) -Test: [ 0/15] Time: 7.796 (7.796) Loss: 6.915 ( 6.915) Acc@1: 0.000 ( 0.000) Acc@5: 0.000 ( 0.000) -Test: [ 15/15] Time: 1.915 (1.263) Loss: 6.847 ( 6.818) Acc@1: 0.000 ( 0.000) Acc@5: 0.000 ( 0.688) - -Train: 1 [ 0/36 ( 3%)] Loss: 6.84 (6.84) Time: 6.687s, 38.28/s (6.687s, 38.28/s) LR: 2.001e-02 Data: 0.701 (0.701) -Test: [ 0/15] Time: 1.315 (1.315) Loss: 2.463 ( 2.463) Acc@1: 14.062 ( 14.062) Acc@5: 48.828 ( 48.828) -Test: [ 15/15] Time: 0.020 (0.180) Loss: 1.812 ( 1.982) Acc@1: 52.326 ( 32.934) Acc@5: 66.279 ( 75.064) - -``` - - ## Single-HPU inference @@ -149,15 +80,6 @@ python inference.py \ --graph_mode ``` -### HPU with lazy mode -```bash -python inference.py \ - --data-dir='./' \ - --dataset hfds/johnowhitaker/imagenette2-320 \ - --device='hpu' \ - --model resnet50.a1_in1k \ - --split train -``` diff --git a/examples/pytorch-image-models/train_hpu_graph.py b/examples/pytorch-image-models/train_hpu_graph.py index 0bcfbe7295..c9d0974258 100755 --- a/examples/pytorch-image-models/train_hpu_graph.py +++ b/examples/pytorch-image-models/train_hpu_graph.py @@ -136,6 +136,12 @@ metavar="PATH", help="Load this checkpoint into model after initialization (default: none)", ) +group.add_argument( + "--save_checkpoint", + action="store_true", + default=False, + help="saving checkpoint for each epoch", +) group.add_argument( "--resume", default="", @@ -1048,17 +1054,18 @@ def main(): ] ) output_dir = utils.get_outdir(args.output if args.output else "./output/train", exp_name) - saver = utils.CheckpointSaver( - model=model, - optimizer=optimizer, - args=args, - model_ema=model_ema, - amp_scaler=loss_scaler, - checkpoint_dir=output_dir, - recovery_dir=output_dir, - decreasing=decreasing_metric, - max_history=args.checkpoint_hist, - ) + if args.save_checkpoint: + saver = utils.CheckpointSaver( + model=model, + optimizer=optimizer, + args=args, + model_ema=model_ema, + amp_scaler=loss_scaler, + checkpoint_dir=output_dir, + recovery_dir=output_dir, + decreasing=decreasing_metric, + max_history=args.checkpoint_hist, + ) with open(os.path.join(output_dir, "args.yaml"), "w") as f: f.write(args_text) @@ -1092,7 +1099,7 @@ def main(): if utils.is_primary(args): _logger.info( - f'Scheduled epochs: {num_epochs}. LR stepped per {"epoch" if lr_scheduler.t_in_epochs else "update"}.' + f"Scheduled epochs: {num_epochs}. LR stepped per {'epoch' if lr_scheduler.t_in_epochs else 'update'}." ) results = [] @@ -1324,7 +1331,7 @@ def _backward(_loss): if utils.is_primary(args): _logger.info( f"Train: {epoch} [{update_idx:>4d}/{updates_per_epoch} " - f"({100. 
* (update_idx + 1) / updates_per_epoch:>3.0f}%)] " + f"({100.0 * (update_idx + 1) / updates_per_epoch:>3.0f}%)] " f"Loss: {losses_m.val:#.3g} ({losses_m.avg:#.3g}) " f"Time: {update_time_m.val:.3f}s, {update_sample_count / update_time_m.val:>7.2f}/s " f"({update_time_m.avg:.3f}s, {update_sample_count / update_time_m.avg:>7.2f}/s) " diff --git a/examples/pytorch-image-models/train_hpu_lazy.py b/examples/pytorch-image-models/train_hpu_lazy.py index bca523c9b4..17f1dac0d9 100755 --- a/examples/pytorch-image-models/train_hpu_lazy.py +++ b/examples/pytorch-image-models/train_hpu_lazy.py @@ -138,6 +138,12 @@ metavar="PATH", help="Load this checkpoint into model after initialization (default: none)", ) +group.add_argument( + "--save_checkpoint", + action="store_true", + default=False, + help="saving checkpoint for each epoch", +) group.add_argument( "--resume", default="", @@ -1047,17 +1053,18 @@ def main(): ] ) output_dir = utils.get_outdir(args.output if args.output else "./output/train", exp_name) - saver = utils.CheckpointSaver( - model=model, - optimizer=optimizer, - args=args, - model_ema=model_ema, - amp_scaler=loss_scaler, - checkpoint_dir=output_dir, - recovery_dir=output_dir, - decreasing=decreasing_metric, - max_history=args.checkpoint_hist, - ) + if args.save_checkpoint: + saver = utils.CheckpointSaver( + model=model, + optimizer=optimizer, + args=args, + model_ema=model_ema, + amp_scaler=loss_scaler, + checkpoint_dir=output_dir, + recovery_dir=output_dir, + decreasing=decreasing_metric, + max_history=args.checkpoint_hist, + ) with open(os.path.join(output_dir, "args.yaml"), "w") as f: f.write(args_text) @@ -1091,7 +1098,7 @@ def main(): if utils.is_primary(args): _logger.info( - f'Scheduled epochs: {num_epochs}. LR stepped per {"epoch" if lr_scheduler.t_in_epochs else "update"}.' + f"Scheduled epochs: {num_epochs}. LR stepped per {'epoch' if lr_scheduler.t_in_epochs else 'update'}." ) results = [] @@ -1325,7 +1332,7 @@ def _backward(_loss): if utils.is_primary(args): _logger.info( f"Train: {epoch} [{update_idx:>4d}/{updates_per_epoch} " - f"({100. * (update_idx + 1) / updates_per_epoch:>3.0f}%)] " + f"({100.0 * (update_idx + 1) / updates_per_epoch:>3.0f}%)] " f"Loss: {losses_m.val:#.3g} ({losses_m.avg:#.3g}) " f"Time: {update_time_m.val:.3f}s, {update_sample_count / update_time_m.val:>7.2f}/s " f"({update_time_m.avg:.3f}s, {update_sample_count / update_time_m.avg:>7.2f}/s) " diff --git a/examples/question-answering/README.md b/examples/question-answering/README.md index c7414c777d..d7a83ea5c8 100755 --- a/examples/question-answering/README.md +++ b/examples/question-answering/README.md @@ -33,163 +33,6 @@ First, you should install the requirements: pip install -r requirements.txt ``` -## Fine-tuning BERT on SQuAD1.1 - -For the following cases, an example of a Gaudi configuration file is given -[here](https://github.com/huggingface/optimum-habana#how-to-use-it). - - -### Single-card Training - -This example code fine-tunes BERT on the SQuAD1.1 dataset. 
- -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -For torch.compile mode, -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -### Multi-card Training - -Here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using the `run_qa` script, with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - -For torch.compile mode, -```bash -PT_HPU_LAZY_MODE=0 PT_ENABLE_INT64_SUPPORT=1 python ../gaudi_spawn.py \ - --world_size 8 --use_mpi run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --torch_compile_backend hpu_backend \ - --torch_compile \ - --use_lazy_mode false \ - --throughput_warmup_steps 3 \ - --bf16 \ - --sdp_on_bf16 -``` - - -### Using DeepSpeed - -Similarly to multi-card training, here is how you would fine-tune the BERT large model (with whole word masking) on the SQuAD dataset using DeepSpeed with 8 HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 32 \ - --per_device_eval_batch_size 8 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad_output/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --throughput_warmup_steps 3 \ - --deepspeed path_to_my_deepspeed_config \ - --sdp_on_bf16 -``` - -You can look at the 
[documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. -Here is a DeepSpeed configuration you can use to train your models on Gaudi: -```json -{ - "steps_per_print": 64, - "train_batch_size": "auto", - "train_micro_batch_size_per_gpu": "auto", - "gradient_accumulation_steps": "auto", - "bf16": { - "enabled": true - }, - "gradient_clipping": 1.0, - "zero_optimization": { - "stage": 2, - "overlap_comm": false, - "reduce_scatter": false, - "contiguous_gradients": false - } -} -``` - ## Fine-tuning Llama on SQuAD1.1 > [!NOTE] @@ -199,7 +42,7 @@ Here is a command you can run to train a Llama model for question answering: ```bash python ../gaudi_spawn.py \ --world_size 8 --use_deepspeed run_qa.py \ - --model_name_or_path FlagAlpha/Llama2-Chinese-13b-Chat \ + --model_name_or_path meta-llama/Llama-2-7b-chat-hf \ --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ --dataset_name squad \ --do_train \ @@ -224,77 +67,3 @@ python ../gaudi_spawn.py \ ## Inference To run only inference, you can start from the commands above and you just have to remove the training-only arguments such as `--do_train`, `--per_device_train_batch_size`, `--num_train_epochs`, etc... - -For instance, you can run inference with BERT on SQuAD on 1 Gaudi card with the following command: -```bash -python run_qa.py \ - --model_name_or_path bert-large-uncased-whole-word-masking \ - --gaudi_config_name Habana/bert-large-uncased-whole-word-masking \ - --dataset_name squad \ - --do_eval \ - --per_device_eval_batch_size 8 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/squad/ \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --bf16 \ - --sdp_on_bf16 -``` - - -## Recommended Hyperparameters for Mixed Precision - -| | learning_rate | num_train_epochs | per_device_train_batch_size | per_device_eval_batch_size | -|----------------------------|:----:|:--:|:-:|:-:| -| BERT base | 3e-5 | 2 | 24 | 8 | -| BERT large | 3e-5 | 2 | 24 | 8 | -| RoBERTa base | 3e-5 | 2 | 12 | 8 | -| RoBERTa large | 3e-5 | 2 | 12 | 8 | -| ALBERT large (single-card) | 5e-5 | 2 | 32 | 4 | -| ALBERT large (multi-card) | 6e-5 | 2 | 32 | 4 | -| ALBERT XXL (single-card) | 5e-6 | 2 | 16 | 2 | -| ALBERT XXL (multi-card) | 5e-5 | 2 | 16 | 2 | -| DistilBERT | 5e-5 | 3 | 8 | 8 | -| meta-llama/Llama-2-13b-chat-hf (multi-card) | 3e-5 | 2 | 8 | 8 | -| FlagAlpha/Llama2-Chinese-13b-Chat (multi-card) | 3e-5 | 2 | 8 | 8 | - - -## Fine-tuning T5 on SQuAD2.0 - -The [`run_seq2seq_qa.py`](https://github.com/huggingface/optimum-habana/blob/main/examples/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer. 
- -The following command fine-tunes T5 on the SQuAD2.0 dataset: - -```bash -python run_seq2seq_qa.py \ - --model_name_or_path t5-small \ - --gaudi_config_name Habana/t5 \ - --dataset_name squad_v2 \ - --version_2_with_negative \ - --context_column context \ - --question_column question \ - --answer_column answers \ - --do_train \ - --do_eval \ - --per_device_train_batch_size 16 \ - --per_device_eval_batch_size 33 \ - --learning_rate 3e-5 \ - --num_train_epochs 2 \ - --max_seq_length 384 \ - --doc_stride 128 \ - --output_dir /tmp/seq2seq_squad/ \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --sdp_on_bf16 \ - --bf16 -``` - -For multi-card and DeepSpeed runs, you can use `python ../gaudi_spawn.py --world_size 8 --use_mpi` and `python ../gaudi_spawn.py --world_size 8 --use_deepspeed` as shown in the previous sections. diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index 5ad77be381..e7975cff94 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index aaadbee417..dc9be22d4d 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/sentence-transformers-training/nli/README.md b/examples/sentence-transformers-training/nli/README.md index 7a1b0079a9..4d21543da6 100644 --- a/examples/sentence-transformers-training/nli/README.md +++ b/examples/sentence-transformers-training/nli/README.md @@ -4,6 +4,13 @@ Given two sentences (premise and hypothesis), the task of Natural Language Infer The paper in [Conneau et al.](https://arxiv.org/abs/1705.02364) shows that NLI data can be quite useful when training Sentence Embedding methods. In [Sentence-BERT-Paper](https://arxiv.org/abs/1908.10084) NLI as a first fine-tuning step for sentence embedding methods has been used. +## Requirements + +First, you should install the requirements: +```bash +pip install -r requirements.txt +``` + # General Models ## Single-card Training @@ -39,6 +46,7 @@ test_dataset = load_dataset("sentence-transformers/stsb", split="test") ```bash python training_nli.py bert-base-uncased ``` +If you want to save model checkpoints, add `--saving_model_checkpoints` to the command; the same applies to all of the examples below.
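For example, a checkpoint-saving variant of the run above would be (a minimal sketch):

```bash
python training_nli.py bert-base-uncased --saving_model_checkpoints
```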
## Multi-card Training @@ -59,16 +67,16 @@ Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 1 python training_nli.py intfloat/e5-mistral-7b-instruct --peft --lora_target_module "q_proj" "k_proj" "v_proj" --learning_rate 1e-5 ``` -## Multi-card Training with Deepspeed Zero2/3 +## Multi-card Training with Deepspeed Zero3 -Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB memory). To address this, we can use the Zero2/Zero3 stages of DeepSpeed (model parallelism) to reduce the memory requirements. +Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB memory). To address this, we will use the Zero3 stages of DeepSpeed (model parallelism) to reduce the memory requirements. -Our tests have shown that training this model requires at least four HPUs when using DeepSpeed Zero2. +Our tests have shown that training this model requires at least four HPUs when using DeepSpeed Zero3. ```bash python ../../gaudi_spawn.py --world_size 4 --use_deepspeed training_nli.py intfloat/e5-mistral-7b-instruct --deepspeed ds_config.json --bf16 --no-use_hpu_graphs_for_training --learning_rate 1e-7 ``` -In the above command, we need to enable lazy mode with a learning rate of `1e-7` and configure DeepSpeed using the `ds_config.json` file. To further reduce memory usage, change the stage to 3 (DeepSpeed Zero3) in the `ds_config.json` file. +In the above command, we need to enable lazy mode with a learning rate of `1e-7` and configure DeepSpeed using the `ds_config.json` file. # Dataset diff --git a/examples/sentence-transformers-training/nli/ds_config.json b/examples/sentence-transformers-training/nli/ds_config.json index 5d5b80af99..565d31b6d1 100644 --- a/examples/sentence-transformers-training/nli/ds_config.json +++ b/examples/sentence-transformers-training/nli/ds_config.json @@ -8,7 +8,7 @@ }, "gradient_clipping": 1.0, "zero_optimization": { - "stage": 2, + "stage": 3, "overlap_comm": false, "reduce_scatter": false, "contiguous_gradients": false diff --git a/examples/sentence-transformers-training/nli/requirements.txt b/examples/sentence-transformers-training/nli/requirements.txt new file mode 100644 index 0000000000..1b97e4c3d7 --- /dev/null +++ b/examples/sentence-transformers-training/nli/requirements.txt @@ -0,0 +1,2 @@ +datasets <= 2.19.2 +peft diff --git a/examples/sentence-transformers-training/nli/training_nli.py b/examples/sentence-transformers-training/nli/training_nli.py index 2d051cedea..53b6e7ad47 100644 --- a/examples/sentence-transformers-training/nli/training_nli.py +++ b/examples/sentence-transformers-training/nli/training_nli.py @@ -30,6 +30,7 @@ def main(): # You can specify any Hugging Face pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base parser = argparse.ArgumentParser() parser.add_argument("model_name", help="model name or path", default="bert-base-uncased", nargs="?") + parser.add_argument("--saving_model_checkpoints", help="saving checkpoints", action="store_true", default=False) parser.add_argument("--peft", help="use LoRA", action="store_true", default=False) parser.add_argument("--lora_target_modules", nargs="+", default=["query", "key", "value"]) parser.add_argument("--bf16", help="use bf16", action="store_true", default=False) @@ -107,7 +108,7 @@ def main(): # Optional tracking/debugging parameters: 
evaluation_strategy="steps", eval_steps=100, - save_strategy="steps", + save_strategy="steps" if args.saving_model_checkpoints else "no", save_steps=100, save_total_limit=2, logging_steps=100, @@ -146,10 +147,11 @@ def main(): test_evaluator(model) # 8. Save the trained & evaluated model locally - final_output_dir = f"{output_dir}/final" - model.save(final_output_dir) + if args.saving_model_checkpoints: + final_output_dir = f"{output_dir}/final" + model.save(final_output_dir) - if args.peft: + if args.saving_model_checkpoints and args.peft: model.eval() model = model.merge_and_unload() model.save_pretrained(f"{output_dir}/merged") diff --git a/examples/sentence-transformers-training/paraphrases/README.md b/examples/sentence-transformers-training/paraphrases/README.md index 8961172025..1e95a425d1 100644 --- a/examples/sentence-transformers-training/paraphrases/README.md +++ b/examples/sentence-transformers-training/paraphrases/README.md @@ -4,6 +4,12 @@ To fine-tune on the paraphrase task: +0. Install required packages + + ```sh + pip install -r requirements.txt + ``` + 1. Choose a pre-trained model `` (For example: `bert-base-uncased`). 2. Choose the training, evaluation, and test dataset(s). Here, we use a dataset dictionary to include multiple datasets. diff --git a/examples/sentence-transformers-training/paraphrases/requirements.txt b/examples/sentence-transformers-training/paraphrases/requirements.txt new file mode 100644 index 0000000000..b776a8dd19 --- /dev/null +++ b/examples/sentence-transformers-training/paraphrases/requirements.txt @@ -0,0 +1 @@ +datasets <= 2.19.2 diff --git a/examples/sentence-transformers-training/sts/README.md b/examples/sentence-transformers-training/sts/README.md index 3ca2602012..61e5af90f4 100644 --- a/examples/sentence-transformers-training/sts/README.md +++ b/examples/sentence-transformers-training/sts/README.md @@ -5,6 +5,13 @@ Semantic Textual Similarity (STS) assigns a score on the similarity of two texts - **[training_stsbenchmark.py](training_stsbenchmark.py)** - This example shows how to create a SentenceTransformer model from scratch by using a pre-trained transformer model (e.g. [`distilbert-base-uncased`](https://huggingface.co/distilbert/distilbert-base-uncased)) together with a pooling layer. - **[training_stsbenchmark_continue_training.py](training_stsbenchmark_continue_training.py)** - This example shows how to continue training on STS data for a previously created & trained SentenceTransformer model (e.g. [`all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)). +## Requirements + +First, you should install the requirements: +```bash +pip install -r requirements.txt +``` + # General Models ## Single-card Training @@ -26,6 +33,7 @@ test_dataset = load_dataset("sentence-transformers/stsb", split="test") ```bash python training_stsbenchmark.py bert-base-uncased ``` +If you want to save the checkpoints for training model you need using `--saving_model_checkpoints` in the command and same for all examples below. ## Multi-card Training @@ -46,17 +54,17 @@ Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 1 python training_stsbenchmark.py intfloat/e5-mistral-7b-instruct --peft --lora_target_modules "q_proj" "k_proj" "v_proj" ``` -## Multi-card Training with Deepspeed Zero2/3 +## Multi-card Training with Deepspeed Zero3 -Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB memory). 
To address this, we can use the Zero2/Zero3 stages of DeepSpeed (model parallelism) to reduce the memory requirements. +Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB memory). To address this, we will use the Zero3 stages of DeepSpeed (model parallelism) to reduce the memory requirements. -Our tests have shown that training this model requires at least four HPUs when using DeepSpeed Zero2. +Our tests have shown that training this model requires at least four HPUs when using DeepSpeed Zero3. ```bash python ../../gaudi_spawn.py --world_size 4 --use_deepspeed training_stsbenchmark.py intfloat/e5-mistral-7b-instruct --deepspeed ds_config.json --bf16 --no-use_hpu_graphs_for_training --learning_rate 1e-7 ``` -In the above command, we need to enable lazy mode with a learning rate of `1e-7` and configure DeepSpeed using the `ds_config.json` file. To further reduce memory usage, change the stage to 3 (DeepSpeed Zero3) in the `ds_config.json` file. +In the above command, we need to enable lazy mode with a learning rate of `1e-7` and configure DeepSpeed using the `ds_config.json` file. # Training data diff --git a/examples/sentence-transformers-training/sts/ds_config.json b/examples/sentence-transformers-training/sts/ds_config.json index 5d5b80af99..565d31b6d1 100644 --- a/examples/sentence-transformers-training/sts/ds_config.json +++ b/examples/sentence-transformers-training/sts/ds_config.json @@ -8,7 +8,7 @@ }, "gradient_clipping": 1.0, "zero_optimization": { - "stage": 2, + "stage": 3, "overlap_comm": false, "reduce_scatter": false, "contiguous_gradients": false diff --git a/examples/sentence-transformers-training/sts/requirements.txt b/examples/sentence-transformers-training/sts/requirements.txt new file mode 100644 index 0000000000..1b97e4c3d7 --- /dev/null +++ b/examples/sentence-transformers-training/sts/requirements.txt @@ -0,0 +1,2 @@ +datasets <= 2.19.2 +peft diff --git a/examples/sentence-transformers-training/sts/training_stsbenchmark.py b/examples/sentence-transformers-training/sts/training_stsbenchmark.py index 4dd0b8884b..c76733bf64 100644 --- a/examples/sentence-transformers-training/sts/training_stsbenchmark.py +++ b/examples/sentence-transformers-training/sts/training_stsbenchmark.py @@ -27,6 +27,7 @@ def main(): # You can specify any Hugging Face pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base parser = argparse.ArgumentParser() parser.add_argument("model_name", help="model name or path", default="distilbert-base-uncased", nargs="?") + parser.add_argument("--saving_model_checkpoints", help="saving checkpoints", action="store_true", default=False) parser.add_argument("--peft", help="use LoRA", action="store_true", default=False) parser.add_argument("--lora_target_modules", nargs="+", default=["q_lin", "k_lin", "v_lin"]) parser.add_argument("--bf16", help="use bf16", action="store_true", default=False) @@ -104,7 +105,7 @@ def main(): # Optional tracking/debugging parameters: evaluation_strategy="steps", eval_steps=100, - save_strategy="steps", + save_strategy="steps" if args.saving_model_checkpoints else "no", save_steps=100, save_total_limit=2, logging_steps=100, @@ -142,10 +143,11 @@ def main(): test_evaluator(model) # 8. 
Save the trained & evaluated model locally - final_output_dir = f"{output_dir}/final" - model.save(final_output_dir) + if args.saving_model_checkpoints: + final_output_dir = f"{output_dir}/final" + model.save(final_output_dir) - if args.peft: + if args.saving_model_checkpoints and args.peft: model.eval() model = model.merge_and_unload() model.save_pretrained(f"{output_dir}/merged") diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md index fe80cf775f..d51d990db7 100644 --- a/examples/speech-recognition/README.md +++ b/examples/speech-recognition/README.md @@ -89,7 +89,7 @@ python run_speech_recognition_ctc.py \ --bf16 \ --use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ - --sdp_on_bf16 + --attn_implementation sdpa ``` On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**. @@ -132,7 +132,7 @@ python ../gaudi_spawn.py \ --sdp_on_bf16 \ --use_hpu_graphs_for_training \ --use_hpu_graphs_for_inference \ - --sdp_on_bf16 + --attn_implementation sdpa ``` On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**. @@ -145,7 +145,7 @@ On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of ** > You need to install DeepSpeed with: > ```bash -> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 > ``` DeepSpeed can be used with almost the same command as for a multi-card run: @@ -181,7 +181,8 @@ python ../gaudi_spawn.py \ --gaudi_config_name Habana/wav2vec2 \ --throughput_warmup_steps 3 \ --deepspeed ../../tests/configs/deepspeed_zero_2.json \ - --sdp_on_bf16 + --sdp_on_bf16 \ + --attn_implementation sdpa ``` [The documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) provides more information about how to use DeepSpeed within Optimum Habana. @@ -214,8 +215,7 @@ python run_speech_recognition_ctc.py \ --gaudi_config_name="Habana/wav2vec2" \ --sdp_on_bf16 \ --bf16 \ - --use_hpu_graphs_for_inference \ - --sdp_on_bf16 + --use_hpu_graphs_for_inference ``` ## Sequence to Sequence diff --git a/examples/speech-recognition/requirements.txt b/examples/speech-recognition/requirements.txt index 3319dee2c7..b7c33c8ba1 100644 --- a/examples/speech-recognition/requirements.txt +++ b/examples/speech-recognition/requirements.txt @@ -1,4 +1,5 @@ datasets >= 1.18.0, <= 2.19.2 +numba==0.60.0 librosa jiwer evaluate diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index 9d53e58519..00c877e089 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") @@ -152,6 +152,33 @@ class ModelArguments: "useful to downsample the output length." 
}, ) + use_flash_attention: bool = field( + default=False, metadata={"help": "Whether to use Habana flash attention for fine-tuning"} + ) + flash_attention_recompute: bool = field( + default=False, + metadata={ + "help": "Whether to enable recompute in Habana flash attention for fine-tuning." + " It is applicable only when use_flash_attention is True." + }, + ) + flash_attention_fast_softmax: bool = field( + default=False, + metadata={ + "help": "Whether to use fast softmax for Habana flash attention." + " It is applicable only when use_flash_attention is True." + }, + ) + + def __post_init__(self): + if self.use_flash_attention: + os.environ["USE_FLASH_ATTENTION"] = "1" + if self.flash_attention_recompute: + assert self.use_flash_attention, "flash_attention_recompute is set, but use_flash_attention is not" + os.environ["FLASH_ATTENTION_RECOMPUTE"] = "1" + if self.flash_attention_fast_softmax: + assert self.use_flash_attention, "flash_attention_fast_softmax is set, but use_flash_attention is not" + os.environ["FLASH_ATTENTION_FAST_SOFTMAX"] = "1" @dataclass @@ -504,7 +531,7 @@ def main(): # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = ( - f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None + f"[{''.join(data_args.chars_to_ignore).replace(' ', '')}]" if data_args.chars_to_ignore is not None else None ) text_column_name = data_args.text_column_name @@ -535,6 +562,7 @@ def remove_special_characters(batch): cache_dir=model_args.cache_dir, token=data_args.token, trust_remote_code=data_args.trust_remote_code, + attn_implementation=training_args.attn_implementation, ) # 4. Next, if no tokenizer file is defined, diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index db25b852eb..1933bc0767 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -56,7 +56,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md index 00759e9bf1..9919780543 100644 --- a/examples/stable-diffusion/README.md +++ b/examples/stable-diffusion/README.md @@ -16,10 +16,10 @@ limitations under the License. # Stable Diffusion Examples -This directory contains a script that showcases how to perform text-to-image generation using Stable Diffusion on IntelĀ® GaudiĀ® AI Accelerators. - -Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team. +This directory contains sample scripts demonstrating how to perform diffusion-based generative tasks on IntelĀ® GaudiĀ® AI Accelerators. +Stable Diffusion was introduced in [Stable Diffusion Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser, +Robin Rombach and the Stability AI team. 
## Requirements @@ -28,11 +28,11 @@ First, you should install the requirements: pip install -r requirements.txt ``` -## Text-to-image Generation +## Text-to-Image Generation -### Single Prompt +### Stable Diffusion -Here is how to generate images with one prompt: +Here's how to generate images using the Stable Diffusion 1.4 model with a single prompt: ```bash python text_to_image_generation.py \ @@ -48,13 +48,12 @@ python text_to_image_generation.py \ --bf16 ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. -### Multiple Prompts - -Here is how to generate images with several prompts: +To generate images with multiple prompts, simply include two prompts in your input as shown below: ```bash python text_to_image_generation.py \ @@ -70,9 +69,7 @@ python text_to_image_generation.py \ --bf16 ``` -### Distributed inference with multiple HPUs - -Here is how to generate images with two prompts on two HPUs: +Distributed inference with multiple HPUs is also supported. Below is an example demonstrating how to generate images with two prompts on two HPUs: ```bash python ../gaudi_spawn.py \ @@ -90,13 +87,18 @@ python ../gaudi_spawn.py \ --distributed ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. +You can run other older Stable Diffusion models in a similar manner. For example, to generate images with Stable Diffusion 1.5, use the option: +`--model_name_or_path stable-diffusion-v1-5/stable-diffusion-v1-5`. Examples showcasing Stable Diffusion 2 are provided next. + ### Stable Diffusion 2 -[Stable Diffusion 2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_2) can also be used to generate images with this script. Here is an example for a single prompt: +[Stable Diffusion 2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_2) can also be used +to generate images with this script. Here is an example demonstrating image generation with a single prompt: ```bash python text_to_image_generation.py \ @@ -114,17 +116,18 @@ python text_to_image_generation.py \ --bf16 ``` +> [!NOTE] > There are two different checkpoints for Stable Diffusion 2: -> > - use [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1) for generating 768x768 images > - use [stabilityai/stable-diffusion-2-1-base](https://huggingface.co/stabilityai/stable-diffusion-2-1-base) for generating 512x512 images ### Latent Diffusion Model for 3D (LDM3D) -[LDM3D](https://arxiv.org/abs/2305.10853) generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. +[LDM3D](https://arxiv.org/abs/2305.10853) generates both image and depth map data from a given text prompt, allowing users +to generate RGBD images from text prompts. -[Original checkpoint](https://huggingface.co/Intel/ldm3d) and [latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) are open source. -A [demo](https://huggingface.co/spaces/Intel/ldm3d) is also available. 
Here is how to run this model: +[Original checkpoint](https://huggingface.co/Intel/ldm3d) and [latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) +are open source. A [demo](https://huggingface.co/spaces/Intel/ldm3d) is also available. Here is how to run this model: ```bash python text_to_image_generation.py \ @@ -144,8 +147,7 @@ python text_to_image_generation.py \ Here is how to generate images and depth maps with two prompts on two HPUs: ```bash -python ../gaudi_spawn.py \ - --world_size 2 text_to_image_generation.py \ +python ../gaudi_spawn.py --world_size 2 text_to_image_generation.py \ --model_name_or_path "Intel/ldm3d-4c" \ --prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \ --num_images_per_prompt 10 \ @@ -160,15 +162,16 @@ python ../gaudi_spawn.py \ --distributed ``` +> [!NOTE] > There are three different checkpoints for LDM3D: -> > - use [original checkpoint](https://huggingface.co/Intel/ldm3d) to generate outputs from the paper > - use [the latest checkpoint](https://huggingface.co/Intel/ldm3d-4c) for generating improved results > - use [the pano checkpoint](https://huggingface.co/Intel/ldm3d-pano) to generate panoramic view ### Stable Diffusion XL (SDXL) -Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/pdf/2307.01952.pdf) by the Stability AI team. +Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/pdf/2307.01952.pdf) +by the Stability AI team. Here is how to generate SDXL images with a single prompt: @@ -178,6 +181,7 @@ python text_to_image_generation.py \ --prompts "Sailing ship painting by Van Gogh" \ --num_images_per_prompt 28 \ --batch_size 7 \ + --num_inference_steps 30 \ --image_save_dir /tmp/stable_diffusion_xl_images \ --scheduler euler_discrete \ --use_habana \ @@ -187,30 +191,12 @@ python text_to_image_generation.py \ --bf16 ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. -Here is how to generate SDXL images with several prompts: - -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ - --num_images_per_prompt 32 \ - --batch_size 8 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --sdp_on_bf16 \ - --bf16 -``` - -SDXL combines a second text encoder (OpenCLIP ViT-bigG/14) with the original text encoder to significantly -increase the number of parameters. Here is how to generate images with several prompts for both `prompt` -and `prompt_2` (2nd text encoder), as well as their negative prompts: +SDXL integrates a second text encoder (OpenCLIP ViT-bigG/14), alongside the original Stable Diffusion text encoder. This addition significantly increases the number of parameters, enabling more detailed and descriptive prompts. 
Below is an example of how to generate images using multiple prompts for both `prompt` (primary text encoder) and `prompt_2` (secondary text encoder), along with their respective negative prompts: ```bash python text_to_image_generation.py \ @@ -230,11 +216,10 @@ python text_to_image_generation.py \ --bf16 ``` -Here is how to generate SDXL images with two prompts on two HPUs: +SDXL also supports distributed inferencing with Intel Gaudi accelerators. Below is an example of generating SDXL images in a distributed manner using two prompts on two HPUs: ```bash -python ../gaudi_spawn.py \ - --world_size 2 text_to_image_generation.py \ +python ../gaudi_spawn.py --world_size 2 text_to_image_generation.py \ --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ --prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \ --prompts_2 "Red tone" "Blue tone" \ @@ -252,26 +237,13 @@ python ../gaudi_spawn.py \ --distributed ``` -Here is how to generate SDXL images with optimized pipeline: -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --prompts "Sailing ship painting by Van Gogh" \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --scheduler euler_discrete \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --sdp_on_bf16 \ - --bf16 \ - --optimize -``` +The performance-optimized SDXL pipeline can be enabled using the `--optimize` option. This option utilizes a more aggressively optimized attention mechanism for enhanced performance. Additionally, it supports running +inference in mixed FP8 precision. -Here is how to generate SDXL images with optimized pipeline in fp8: +Here is how to generate SDXL images with optimized pipeline in FP8 precision: ```bash -QUANT_CONFIG=./quantization/quant_config.json python text_to_image_generation.py \ +QUANT_CONFIG=quantization/stable-diffusion-xl/quantize_config.json \ +python text_to_image_generation.py \ --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ --prompts "Sailing ship painting by Van Gogh" \ --num_images_per_prompt 28 \ @@ -286,13 +258,11 @@ QUANT_CONFIG=./quantization/quant_config.json python text_to_image_generation.py --optimize ``` -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - ### SDXL-Turbo -SDXL-Turbo is a distilled version of SDXL 1.0, trained for real-time synthesis. +The knowledge distillation technique can be used to train a distilled version of SDXL, allowing for high-quality +image generation with fewer inference steps. SDXL-Turbo is a distilled version of Stable Diffusion XL 1.0, +optimized for real-time synthesis. Here is how to generate images with multiple prompts: @@ -314,11 +284,9 @@ python text_to_image_generation.py \ --timestep_spacing trailing ``` -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. - -> Note: there is a regression with "--guidance_scale 0.0" in current release which will be addressed in later releases. Setting `--guidance_scale` to a value larger than 1 resolves the regression. 
+> [!WARNING] +> There is a regression with `--guidance_scale 0.0` in current release which will be addressed in later releases. +> Setting `--guidance_scale` to a value larger than 1 resolves the regression. ### Stable Diffusion 3 (SD3) @@ -337,7 +305,6 @@ huggingface-cli login Here is how to generate SD3 images with a single prompt: ```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ python text_to_image_generation.py \ --model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers \ --prompts "Sailing ship painting by Van Gogh" \ @@ -353,14 +320,53 @@ python text_to_image_generation.py \ --bf16 ``` -> For improved performance of the SD3 pipeline on Gaudi, it is recommended to configure the environment -> by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. +This model can also be quantized with some ops running in FP8 precision. + +Before quantization, run stats collection using measure mode: + +```bash +QUANT_CONFIG=quantization/stable-diffusion-3/measure_config.json \ +python text_to_image_generation.py \ + --model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers \ + --prompts "Sailing ship painting by Van Gogh" \ + --num_images_per_prompt 10 \ + --batch_size 1 \ + --num_inference_steps 28 \ + --image_save_dir /tmp/stable_diffusion_3_images \ + --scheduler default \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 \ + --quant_mode measure +``` + +After stats collection, here is how to run SD3 in quantization mode: + +```bash +QUANT_CONFIG=quantization/stable-diffusion-3/quantize_config.json \ +python text_to_image_generation.py \ + --model_name_or_path stabilityai/stable-diffusion-3-medium-diffusers \ + --prompts "Sailing ship painting by Van Gogh" \ + --num_images_per_prompt 10 \ + --batch_size 1 \ + --num_inference_steps 28 \ + --image_save_dir /tmp/stable_diffusion_3_images \ + --scheduler default \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 \ + --quant_mode quantize +``` ### FLUX.1 FLUX.1 was introduced by Black Forest Labs [here](https://blackforestlabs.ai/announcing-black-forest-labs/). 
-Here is how to run FLUX.1-schnell model (fast version of FLUX.1): +Here is how to run the FLUX.1-schnell model (a distilled, fast version of FLUX.1): ```bash python text_to_image_generation.py \ @@ -370,7 +376,7 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 4 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ @@ -397,7 +403,7 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 30 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ @@ -418,7 +424,7 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 30 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ @@ -438,7 +444,7 @@ python text_to_image_generation.py \ --batch_size 1 \ --num_inference_steps 30 \ --image_save_dir /tmp/flux_1_images \ - --scheduler flow_match_euler_discrete\ + --scheduler flow_match_euler_discrete \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ @@ -449,10 +455,11 @@ python text_to_image_generation.py \ ## ControlNet -ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. -It is a type of model for controlling StableDiffusion by conditioning the model with an additional input image. -Here is how to generate images conditioned by canny edge model: +ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) +by Lvmin Zhang and Maneesh Agrawala. It enables conditioning the Stable Diffusion model with an additional input image. This allows for precise control over the composition of generated images using various features such as edges, pose, depth, and more. + +Here is how to generate images conditioned by the Canny edge model: ```bash python text_to_image_generation.py \ @@ -470,29 +477,11 @@ python text_to_image_generation.py \ --bf16 ``` -Here is how to generate images conditioned by canny edge model and with multiple prompts: +The ControlNet example can be run with multiple prompts by supplying more than one prompt in the input. +Additionally, it supports distributed execution.
Below is an example of generating images conditioned by the Canny edge model using two prompts on two HPUs: ```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ - --prompts "futuristic-looking woman" "a rusty robot" \ - --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --sdp_on_bf16 \ - --bf16 -``` - -Here is how to generate images conditioned by canny edge model and with two prompts on two HPUs: - -```bash -python ../gaudi_spawn.py \ - --world_size 2 text_to_image_generation.py \ +python ../gaudi_spawn.py --world_size 2 text_to_image_generation.py \ --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ --prompts "futuristic-looking woman" "a rusty robot" \ @@ -508,44 +497,7 @@ python ../gaudi_spawn.py \ --distributed ``` -Here is how to generate images conditioned by open pose model: - -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --controlnet_model_name_or_path lllyasviel/sd-controlnet-openpose \ - --prompts "Chef in the kitchen" \ - --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png \ - --control_preprocessing_type "none" \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/controlnet_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --sdp_on_bf16 \ - --bf16 -``` - -Here is how to generate images with conditioned by canny edge model using Stable Diffusion 2 - -```bash -python text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-2-1 \ - --controlnet_model_name_or_path thibaud/controlnet-sd21-canny-diffusers \ - --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png \ - --control_preprocessing_type "none" \ - --prompts "bird" \ - --seed 0 \ - --num_images_per_prompt 28 \ - --batch_size 7 \ - --image_save_dir /tmp/controlnet-2-1_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion-2 \ - --sdp_on_bf16 \ - --bf16 -``` +These ControlNet examples will preprocess the input image to derive Canny edges. Alternatively, you can use `--control_preprocessing_type none` to supply a preprocessed control image directly, enabling many additional use cases. 
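For illustration, here is a sketch of that alternative, adapted from the OpenPose example that previously appeared in this README: the control image is already a pose map, so `--control_preprocessing_type "none"` passes it to the ControlNet as-is instead of deriving Canny edges from it.

```bash
# Sketch: reuses the OpenPose example formerly in this README; the control image is used unprocessed
python text_to_image_generation.py \
    --model_name_or_path CompVis/stable-diffusion-v1-4 \
    --controlnet_model_name_or_path lllyasviel/sd-controlnet-openpose \
    --prompts "Chef in the kitchen" \
    --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png \
    --control_preprocessing_type "none" \
    --num_images_per_prompt 28 \
    --batch_size 7 \
    --image_save_dir /tmp/controlnet_images \
    --use_habana \
    --use_hpu_graphs \
    --gaudi_config Habana/stable-diffusion \
    --sdp_on_bf16 \
    --bf16
```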
## Inpainting @@ -575,7 +527,7 @@ python text_to_image_generation.py \ ```bash python text_to_image_generation.py \ - --model_name_or_path diffusers/stable-diffusion-xl-1.0-inpainting-0.1\ + --model_name_or_path diffusers/stable-diffusion-xl-1.0-inpainting-0.1 \ --base_image https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png \ --mask_image https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png \ --prompts "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" \ @@ -591,44 +543,106 @@ python text_to_image_generation.py \ --bf16 ``` -## Image-to-image Generation +## Additional Stable Diffusion-based Inference Techniques -### Single Prompt +This section provides examples of additional inference techniques based on Stable Diffusion. For more details, please refer to +[Hugging Face Diffusers documentation](https://huggingface.co/docs/diffusers/main/en/using-diffusers/overview_techniques). -Here is how to generate images with one prompt and one image. -Take instruct-pix2pix as an example. +### Unconditional Image Generation + +Here is how to perform unconditional image generation on Intel Gaudi. For more details, please refer to the +[Unconditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/unconditional_image_generation) +section in the Hugging Face documentation. ```bash -python image_to_image_generation.py \ - --model_name_or_path "timbrooks/instruct-pix2pix" \ - --src_image_path "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg" \ - --prompts "turn him into cyborg" \ - --num_images_per_prompt 20 \ - --batch_size 4 \ - --guidance_scale 7.5 \ - --image_guidance_scale 1 \ - --num_inference_steps 10 \ - --image_save_dir /tmp/stable_diffusion_images \ +python unconditional_image_generation.py \ + --model_name_or_path "google/ddpm-ema-celebahq-256" \ + --batch_size 16 \ --use_habana \ + --use_gaudi_ddim_scheduler \ --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 \ + --save_outputs \ + --output_dir "/tmp/" +``` + +### Controlling Brightness + +Here is an example of how to control brightness. For more information, please refer to the +[Control Brightness](https://huggingface.co/docs/diffusers/main/en/using-diffusers/control_brightness) +section in the Hugging Face documentation. + +```bash +PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ +python text_to_image_generation.py \ + --model_name_or_path ptx0/pseudo-journey-v2 \ + --prompts "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" \ + --num_images_per_prompt 1 \ + --batch_size 1 \ + --use_habana \ + --use_hpu_graphs \ + --image_save_dir /tmp/stable_diffusion_images_brightness \ + --seed 33 \ + --use_zero_snr \ + --guidance_scale 0.7 \ + --timestep_spacing trailing +``` + +### Prompt Weighting + +Here is an example of how to run prompt weighting. For more information, please refer to the +[Weighted Prompts](https://huggingface.co/docs/diffusers/main/en/using-diffusers/weighted_prompts) +section in the Hugging Face documentation. 
+ +```bash +python text_to_image_generation.py \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ + --prompts "a red cat playing with a ball+++" "a red cat playing with a ball---" \ + --num_images_per_prompt 4 \ + --batch_size 4 \ + --use_habana --use_hpu_graphs \ + --image_save_dir /tmp/stable_diffusion_images_compel \ + --seed 33 \ + --sdp_on_bf16 \ + --bf16 \ + --num_inference_steps 20 \ + --use_compel +``` + +### Controlling Image Quality + +Here is an example of how to improve image quality. For more details, please refer to the +[Image Quality](https://huggingface.co/docs/diffusers/main/en/using-diffusers/image_quality) +section in the Hugging Face documentation. + +```bash +python text_to_image_generation.py \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ + --prompts "A squirrel eating a burger" \ + --num_images_per_prompt 4 \ + --batch_size 4 \ + --use_habana \ + --image_save_dir /tmp/stable_diffusion_images_freeu \ + --seed 33 \ + --use_freeu \ --sdp_on_bf16 \ --bf16 ``` -> HPU graphs are recommended when generating images by batches to get the fastest possible generations. -> The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. -> You can enable this mode with `--use_hpu_graphs`. +## Image-to-Image Generation + +Images can also be generated using initial input images to guide the diffusion-based image generation process. -### Multiple Prompts +### Stable Diffusion-based Image-to-Image -Here is how to generate images with several prompts and one image. +Here is how to generate images using a single prompt and an input image with the `timbrooks/instruct-pix2pix` model, which is based on Stable Diffusion: ```bash python image_to_image_generation.py \ --model_name_or_path "timbrooks/instruct-pix2pix" \ --src_image_path "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg" \ - --prompts "turn him into cyborg" "a strong soldier"\ + --prompts "turn him into cyborg" \ --num_images_per_prompt 20 \ --batch_size 4 \ --guidance_scale 7.5 \ @@ -642,13 +656,14 @@ python image_to_image_generation.py \ --bf16 ``` +> [!NOTE] > HPU graphs are recommended when generating images by batches to get the fastest possible generations. > The first batch of images entails a performance penalty. All subsequent batches will be generated much faster. > You can enable this mode with `--use_hpu_graphs`. 
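The same script also accepts several prompts. As a sketch (mirroring the multi-prompt example that previously appeared in this README), simply list them after `--prompts` and keep the remaining flags as in the command above:

```bash
# Sketch based on the multi-prompt example formerly in this README; other flags unchanged
python image_to_image_generation.py \
    --model_name_or_path "timbrooks/instruct-pix2pix" \
    --src_image_path "https://raw.githubusercontent.com/timothybrooks/instruct-pix2pix/main/imgs/example.jpg" \
    --prompts "turn him into cyborg" "a strong soldier" \
    --num_images_per_prompt 20 \
    --batch_size 4 \
    --guidance_scale 7.5 \
    --image_guidance_scale 1 \
    --num_inference_steps 10 \
    --image_save_dir /tmp/stable_diffusion_images \
    --use_habana \
    --use_hpu_graphs \
    --gaudi_config Habana/stable-diffusion \
    --sdp_on_bf16 \
    --bf16
```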
### Stable Diffusion XL Refiner -Here is how to generate SDXL images with a single prompt and one image: +Here is how to refine SDXL images using a single image and prompt: ```bash python image_to_image_generation.py \ @@ -667,17 +682,17 @@ python image_to_image_generation.py \ --bf16 ``` -### FLUX.1 Image to Image +### FLUX.1 Image-to-Image -Here is how to generate FLUX.1 images with a single prompt and one input image: +Here is how to generate a FLUX.1 image using a single input image and prompt: ```bash python image_to_image_generation.py \ --model_name_or_path "black-forest-labs/FLUX.1-dev" \ --src_image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ --prompts "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k" \ - --num_images_per_prompt 40 \ - --batch_size 10 \ + --num_images_per_prompt 10 \ + --batch_size 1 \ --strength 0.9 \ --guidance_scale 3.5 \ --num_inference_steps 30 \ @@ -691,7 +706,7 @@ python image_to_image_generation.py \ ### Stable Diffusion Image Variations -Here is how to generate images with one image, it does not accept prompt input +Here is how to generate image variations of a single image (without any input prompts): ```bash python image_to_image_generation.py \ @@ -710,7 +725,7 @@ python image_to_image_generation.py \ ### Depth to Image Generation -Here is how to generate a depth2img-guided image generation using HPU graphs with BF16: +Here is an example of performing depth-guided image generation: ```bash python depth_to_image_generation.py \ @@ -724,88 +739,20 @@ python depth_to_image_generation.py \ --bf16 ``` -## Unconditional Image Generation Example - -Here is how to perform unconditional-image-generation on Gaudi/HPU. - -Original unconditional image generation pipeline is shared in here: [Unconditional Image Generation](https://huggingface.co/docs/diffusers/using-diffusers/unconditional_image_generation) - -```bash -python unconditional_image_generation.py \ - --model_name_or_path "google/ddpm-ema-celebahq-256" \ - --batch_size 16 \ - --use_habana \ - --use_gaudi_ddim_scheduler \ - --use_hpu_graphs \ - --sdp_on_bf16 \ - --bf16 \ - --save_outputs \ - --output_dir "/tmp/" -``` - -## Additional inference techniques - -Here is how to run the diffusers examples of inference techniques. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/overview_techniques). +## Text-to-Video Generation -### Controlling brightness - -Here is how to run the example of controlling brightness. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/control_brightness). +This section demonstrates how to use the `GaudiTextToVideoSDPipeline` for text-to-video generation tasks on HPUs. +The pipeline employs a UNet3D structure and generates videos through an iterative denoising process. 
```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=1 python text_to_image_generation.py \ - --model_name_or_path ptx0/pseudo-journey-v2 \ - --prompts "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" \ - --num_images_per_prompt 1 \ - --batch_size 1 \ +python text_to_video_generation.py \ + --model_name_or_path ali-vilab/text-to-video-ms-1.7b \ + --prompts "An astronaut riding a horse" \ --use_habana \ --use_hpu_graphs \ - --image_save_dir /tmp/stable_diffusion_images_brightness \ - --seed 33 \ - --use_zero_snr \ - --guidance_scale 0.7 \ - --timestep_spacing trailing + --dtype bf16 ``` -### Prompt weighting - -Here is how to run the example of prompt weighting. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/weighted_prompts). - -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --prompts "a red cat playing with a ball+++" "a red cat playing with a ball---" \ - --num_images_per_prompt 4 \ - --batch_size 4 \ - --use_habana --use_hpu_graphs \ - --image_save_dir /tmp/stable_diffusion_images_compel \ - --seed 33 \ - --sdp_on_bf16 \ - --bf16 \ - --num_inference_steps 20 \ - --use_compel -``` - -### Controlling image quality - -Here is how to run the example of improving image quality. For more details, -please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffusers/main/en/using-diffusers/image_quality). - -```bash -python text_to_image_generation.py \ - --model_name_or_path CompVis/stable-diffusion-v1-4 \ - --prompts "A squirrel eating a burger" \ - --num_images_per_prompt 4 \ - --batch_size 4 \ - --use_habana \ - --image_save_dir /tmp/stable_diffusion_images_freeu \ - --seed 33 \ - --use_freeu \ - --sdp_on_bf16 \ - --bf16 -``` # Stable Video Diffusion Examples Stable Video Diffusion (SVD) was unveiled in [Stable Video Diffusion Announcement](https://stability.ai/news/stable-video-diffusion-open-ai-video-model) @@ -834,6 +781,7 @@ python image_to_video_generation.py \ --bf16 ``` +> [!NOTE] > For improved performance of the image-to-video pipeline on Gaudi, it is recommended to configure the environment > by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. 
@@ -845,10 +793,11 @@ Here is how to generate videos with several image prompts: PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ python image_to_video_generation.py \ --model_name_or_path "stabilityai/stable-video-diffusion-img2vid-xt" \ - --image_path "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" \ - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ - "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" \ - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" \ + --image_path \ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png" \ + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png" \ + "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" \ + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" \ --num_videos_per_prompt 1 \ --video_save_dir /tmp/stable_video_diffusion_xt \ --save_frames_as_images \ @@ -859,46 +808,75 @@ python image_to_video_generation.py \ --bf16 ``` +> [!NOTE] > For improved performance of the image-to-video pipeline on Gaudi, it is recommended to configure the environment > by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. -### Image-to-video ControlNet +### Image-to-Video ControlNet Here is how to generate video conditioned by depth: -``` +```bash python image_to_video_generation.py \ --model_name_or_path "stabilityai/stable-video-diffusion-img2vid" \ --controlnet_model_name_or_path "CiaraRowles/temporal-controlnet-depth-svd-v1" \ - --control_image_path "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_0.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_1.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_2.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_3.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_4.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_5.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_6.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_7.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_8.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_9.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_10.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_11.png?raw=true" \ - "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_12.png?raw=true" \ - 
"https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_13.png?raw=true" \ + --control_image_path \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_0.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_1.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_2.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_3.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_4.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_5.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_6.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_7.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_8.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_9.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_10.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_11.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_12.png?raw=true" \ + "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/depth/frame_13.png?raw=true" \ --image_path "https://github.com/CiaraStrawberry/svd-temporal-controlnet/blob/main/validation_demo/chair.png?raw=true" \ --video_save_dir SVD_controlnet \ --save_frames_as_images \ --use_habana \ --use_hpu_graphs \ --gaudi_config Habana/stable-diffusion \ - --bf16 \ --sdp_on_bf16 \ + --bf16 \ --num_frames 14 \ --motion_bucket_id=14 \ --width=512 \ --height=512 ``` -> [!NOTE] -> For Gaudi3 only: -> 1. Due to a known issue, batch sizes for models needs to be reduced. It will be fixed in the future release. -> 2. The Image-to-video ControlNet command is not enabled on Gaudi3. +# I2vgen-xl +I2vgen-xl is high quality Image-to-Video synthesis via cascaded diffusion models. Please refer to [Huggingface i2vgen-xl doc](https://huggingface.co/ali-vilab/i2vgen-xl). 
+ +Here is how to generate video with one image and text prompt: + +```bash +PT_HPU_MAX_COMPOUND_OP_SIZE=1 \ +python image_to_video_generation.py \ + --model_name_or_path "ali-vilab/i2vgen-xl" \ + --image_path "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" \ + --num_videos_per_prompt 1 \ + --video_save_dir ./i2vgen_xl \ + --num_inference_steps 50 \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --gif \ + --num_frames 16 \ + --prompts "Papers were floating in the air on a table in the library" \ + --negative_prompts "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" \ + --seed 8888 \ + --sdp_on_bf16 \ + --bf16 +``` + +# Important Notes for Gaudi3 Users + +- **Batch Size Limitation**: Due to a known issue, batch sizes for some Stable Diffusion models need to be reduced. + This issue is expected to be resolved in a future release. + +- **Image-to-Video ControlNet**: The Image-to-Video ControlNet command is currently not supported on Gaudi3. diff --git a/examples/stable-diffusion/depth_to_image_generation.py b/examples/stable-diffusion/depth_to_image_generation.py index c32d61a05b..106863174e 100755 --- a/examples/stable-diffusion/depth_to_image_generation.py +++ b/examples/stable-diffusion/depth_to_image_generation.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py index c76d3c0f5a..1e0d65e902 100755 --- a/examples/stable-diffusion/image_to_image_generation.py +++ b/examples/stable-diffusion/image_to_image_generation.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logger = logging.getLogger(__name__) @@ -223,10 +223,10 @@ def main(): args = parser.parse_args() # Set image resolution - res = {} + kwargs_call = {} if args.width > 0 and args.height > 0: - res["width"] = args.width - res["height"] = args.height + kwargs_call["width"] = args.width + kwargs_call["height"] = args.height sdxl_models = ["stable-diffusion-xl", "sdxl"] sdxl = False flux_models = ["FLUX.1"] @@ -236,6 +236,7 @@ def main(): "use_habana": args.use_habana, "use_hpu_graphs": args.use_hpu_graphs, "gaudi_config": args.gaudi_config_name, + "sdp_on_bf16": args.sdp_on_bf16, } # Import selected pipeline @@ -251,7 +252,7 @@ def main(): from optimum.habana.diffusers import GaudiStableDiffusionInstructPix2PixPipeline as Img2ImgPipeline kwargs["safety_checker"] = None - res["image_guidance_scale"] = args.image_guidance_scale + kwargs_call["image_guidance_scale"] = args.image_guidance_scale elif "image-variations" in args.model_name_or_path: from optimum.habana.diffusers import GaudiStableDiffusionImageVariationPipeline as Img2ImgPipeline @@ -290,7 +291,7 @@ def main(): kwargs["torch_dtype"] = torch.bfloat16 if args.throughput_warmup_steps is not None: - kwargs["throughput_warmup_steps"] = args.throughput_warmup_steps + kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps pipeline = Img2ImgPipeline.from_pretrained( args.model_name_or_path, @@ -324,8 +325,7 @@ def main(): output_type=args.output_type, profiling_warmup_steps=args.profiling_warmup_steps, profiling_steps=args.profiling_steps, - sdp_on_bf16=args.sdp_on_bf16, - **res, + **kwargs_call, ) elif flux: outputs = pipeline( @@ -340,7 +340,7 @@ def main(): output_type=args.output_type, profiling_warmup_steps=args.profiling_warmup_steps, profiling_steps=args.profiling_steps, - **res, + **kwargs_call, ) else: outputs = pipeline( @@ -355,7 +355,7 @@ def main(): output_type=args.output_type, profiling_warmup_steps=args.profiling_warmup_steps, profiling_steps=args.profiling_steps, - **res, + **kwargs_call, ) # Save the pipeline in the specified directory if not None @@ -370,12 +370,12 @@ def main(): logger.info(f"Saving images in {image_save_dir.resolve()}...") if args.ldm3d: for i, rgb in enumerate(outputs.rgb): - rgb.save(image_save_dir / f"rgb_{i+1}.png") + rgb.save(image_save_dir / f"rgb_{i + 1}.png") for i, depth in enumerate(outputs.depth): - depth.save(image_save_dir / f"depth_{i+1}.png") + depth.save(image_save_dir / f"depth_{i + 1}.png") else: for i, image in enumerate(outputs.images): - image.save(image_save_dir / f"image_{i+1}.png") + image.save(image_save_dir / f"image_{i + 1}.png") else: logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py index bd704a301b..8185a7260e 100755 --- a/examples/stable-diffusion/image_to_video_generation.py +++ b/examples/stable-diffusion/image_to_video_generation.py @@ -19,9 +19,13 @@ from pathlib import Path import torch -from diffusers.utils import export_to_video, load_image +from diffusers.utils import export_to_gif, export_to_video, load_image -from optimum.habana.diffusers import GaudiEulerDiscreteScheduler, GaudiStableVideoDiffusionPipeline +from optimum.habana.diffusers import ( + GaudiEulerDiscreteScheduler, + GaudiI2VGenXLPipeline, + GaudiStableVideoDiffusionPipeline, +) from optimum.habana.utils import set_seed @@ 
-34,7 +38,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logger = logging.getLogger(__name__) @@ -57,6 +61,20 @@ def main(): ) # Pipeline arguments + parser.add_argument( + "--prompts", + type=str, + nargs="*", + default="Papers were floating in the air on a table in the library", + help="The prompt or prompts to guide the image generation.", + ) + parser.add_argument( + "--negative_prompts", + type=str, + nargs="*", + default="Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms", + help="The prompt or prompts not to guide the image generation.", + ) parser.add_argument( "--image_path", type=str, @@ -177,6 +195,7 @@ def main(): ), ) parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") + parser.add_argument("--gif", action="store_true", help="Whether to generate the video in gif format.") parser.add_argument( "--sdp_on_bf16", action="store_true", @@ -184,6 +203,24 @@ def main(): help="Allow pyTorch to use reduced precision in the SDPA math backend", ) parser.add_argument("--num_frames", type=int, default=25, help="The number of video frames to generate.") + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--throughput_warmup_steps", + type=int, + default=None, + help="Number of steps to ignore for throughput calculation.", + ) args = parser.parse_args() # Setup logging @@ -194,6 +231,9 @@ def main(): ) logger.setLevel(logging.INFO) + i2v_models = ["i2vgen-xl"] + is_i2v_model = any(model in args.model_name_or_path for model in i2v_models) + # Load input image(s) input = [] logger.info("Input image(s):") @@ -201,7 +241,10 @@ def main(): args.image_path = [args.image_path] for image_path in args.image_path: image = load_image(image_path) - image = image.resize((args.height, args.width)) + if is_i2v_model: + image = image.convert("RGB") + else: + image = image.resize((args.height, args.width)) input.append(image) logger.info(image_path) @@ -263,11 +306,32 @@ def main(): output_type=args.output_type, num_frames=args.num_frames, ) + elif is_i2v_model: + del kwargs["scheduler"] + pipeline = GaudiI2VGenXLPipeline.from_pretrained( + args.model_name_or_path, + **kwargs, + ) + generator = torch.manual_seed(args.seed) + outputs = pipeline( + prompt=args.prompts, + image=input, + num_videos_per_prompt=args.num_videos_per_prompt, + batch_size=args.batch_size, + num_frames=args.num_frames, + num_inference_steps=args.num_inference_steps, + negative_prompt=args.negative_prompts, + guidance_scale=9.0, + generator=generator, + ) else: pipeline = GaudiStableVideoDiffusionPipeline.from_pretrained( args.model_name_or_path, **kwargs, ) + kwargs_call = {} + if args.throughput_warmup_steps is not None: + kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps # Generate images outputs = pipeline( @@ -284,6 +348,9 @@ def main(): noise_aug_strength=args.noise_aug_strength, decode_chunk_size=args.decode_chunk_size, output_type=args.output_type, + profiling_warmup_steps=args.profiling_warmup_steps, + profiling_steps=args.profiling_steps, + 
**kwargs_call, ) # Save the pipeline in the specified directory if not None @@ -297,7 +364,11 @@ def main(): video_save_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Saving video frames in {video_save_dir.resolve()}...") for i, frames in enumerate(outputs.frames): - export_to_video(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".mp4", fps=7) + if args.gif: + export_to_gif(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".gif") + else: + export_to_video(frames, args.video_save_dir + "/gen_video_" + str(i).zfill(2) + ".mp4", fps=7) + if args.save_frames_as_images: for j, frame in enumerate(frames): frame.save( diff --git a/examples/stable-diffusion/quantization/flux/measure_config.json b/examples/stable-diffusion/quantization/flux/measure_config.json index 865078d99f..f90605dba8 100644 --- a/examples/stable-diffusion/quantization/flux/measure_config.json +++ b/examples/stable-diffusion/quantization/flux/measure_config.json @@ -1,5 +1,5 @@ { "method": "HOOKS", "mode": "MEASURE", - "dump_stats_path": "quantization/flux/measure_all/fp8" + "dump_stats_path": "quantization/flux/measure/fp8" } diff --git a/examples/stable-diffusion/quantization/flux/quantize_config.json b/examples/stable-diffusion/quantization/flux/quantize_config.json index 8fdb21fccf..e601db4ba4 100644 --- a/examples/stable-diffusion/quantization/flux/quantize_config.json +++ b/examples/stable-diffusion/quantization/flux/quantize_config.json @@ -2,5 +2,5 @@ "method": "HOOKS", "mode": "QUANTIZE", "scale_method": "maxabs_hw_opt_weight", - "dump_stats_path": "quantization/flux/measure_all/fp8" + "dump_stats_path": "quantization/flux/measure/fp8" } diff --git a/examples/stable-diffusion/quantization/stable-diffusion-3/measure_config.json b/examples/stable-diffusion/quantization/stable-diffusion-3/measure_config.json new file mode 100644 index 0000000000..ebf3baa292 --- /dev/null +++ b/examples/stable-diffusion/quantization/stable-diffusion-3/measure_config.json @@ -0,0 +1,5 @@ +{ + "method": "HOOKS", + "mode": "MEASURE", + "dump_stats_path": "quantization/stable-diffusion-3/measure_all/fp8" +} \ No newline at end of file diff --git a/examples/stable-diffusion/quantization/stable-diffusion-3/quantize_config.json b/examples/stable-diffusion/quantization/stable-diffusion-3/quantize_config.json new file mode 100644 index 0000000000..1fa98ebce0 --- /dev/null +++ b/examples/stable-diffusion/quantization/stable-diffusion-3/quantize_config.json @@ -0,0 +1,6 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "scale_method": "maxabs_hw_opt_weight", + "dump_stats_path": "quantization/stable-diffusion-3/measure_all/fp8" +} \ No newline at end of file diff --git a/examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.json b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.json similarity index 95% rename from examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.json rename to examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.json index 91a74c633c..62c76a2685 100644 --- a/examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.json +++ b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.json @@ -7,7 +7,7 @@ "inputs": [ [ [ - 4.78125 + 7.46875 ] ] ], @@ -55,7 +55,7 @@ "inputs": [ [ [ - 7.40625 + 6.25 ] ] ], @@ -71,7 +71,7 @@ "inputs": [ [ [ - 4.78125 + 4.0 ] ] ], @@ -87,7 +87,7 @@ "inputs": [ [ [ - 9.125 + 5.9375 ] ] ], @@ -119,7 +119,7 @@ "inputs": [ [ [ - 7.875 + 12.5 ] ] ], 
@@ -135,7 +135,7 @@ "inputs": [ [ [ - 9.625 + 7.6875 ] ] ], @@ -167,7 +167,7 @@ "inputs": [ [ [ - 8.4375 + 11.75 ] ] ], @@ -183,7 +183,7 @@ "inputs": [ [ [ - 6.9375 + 13.8125 ] ] ], @@ -199,7 +199,7 @@ "inputs": [ [ [ - 7.9375 + 6.4375 ] ] ], @@ -215,7 +215,7 @@ "inputs": [ [ [ - 9.25 + 8.1875 ] ] ], @@ -231,7 +231,7 @@ "inputs": [ [ [ - 9.25 + 8.1875 ] ] ], @@ -247,7 +247,7 @@ "inputs": [ [ [ - 9.25 + 8.1875 ] ] ], @@ -263,7 +263,7 @@ "inputs": [ [ [ - 4.59375 + 6.21875 ] ] ], @@ -279,12 +279,12 @@ "inputs": [ [ [ - 9.1875 + 7.65625 ] ], [ [ - 9.5625 + 9.4375 ] ] ] @@ -293,12 +293,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 5.75 + 6.34375 ] ] ] @@ -307,14 +307,14 @@ "inputs": [ [ [ - 314.0 + 270.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -323,7 +323,7 @@ "inputs": [ [ [ - 11.375 + 9.0 ] ] ], @@ -371,7 +371,7 @@ "inputs": [ [ [ - 4.875 + 20.125 ] ] ], @@ -392,7 +392,7 @@ ], [ [ - 5.6875 + 6.84375 ] ] ] @@ -415,7 +415,7 @@ "inputs": [ [ [ - 856.0 + 976.0 ] ] ], @@ -431,7 +431,7 @@ "inputs": [ [ [ - 7.75 + 6.3125 ] ] ], @@ -447,7 +447,7 @@ "inputs": [ [ [ - 32.0 + 23.75 ] ] ], @@ -463,7 +463,7 @@ "inputs": [ [ [ - 10.5625 + 9.1875 ] ] ], @@ -479,7 +479,7 @@ "inputs": [ [ [ - 10.5625 + 9.1875 ] ] ], @@ -495,7 +495,7 @@ "inputs": [ [ [ - 10.5625 + 9.1875 ] ] ], @@ -511,7 +511,7 @@ "inputs": [ [ [ - 4.875 + 5.53125 ] ] ], @@ -527,12 +527,12 @@ "inputs": [ [ [ - 7.8125 + 7.0 ] ], [ [ - 9.5 + 9.0 ] ] ] @@ -541,12 +541,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 6.21875 + 5.65625 ] ] ] @@ -555,14 +555,14 @@ "inputs": [ [ [ - 376.0 + 266.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -571,7 +571,7 @@ "inputs": [ [ [ - 14.75 + 11.875 ] ] ], @@ -640,7 +640,7 @@ ], [ [ - 4.65625 + 5.375 ] ] ] @@ -663,7 +663,7 @@ "inputs": [ [ [ - 2576.0 + 2288.0 ] ] ], @@ -679,7 +679,7 @@ "inputs": [ [ [ - 9.5 + 7.75 ] ] ], @@ -695,7 +695,7 @@ "inputs": [ [ [ - 70.5 + 28.0 ] ] ], @@ -711,7 +711,7 @@ "inputs": [ [ [ - 20.375 + 22.875 ] ] ], @@ -727,7 +727,7 @@ "inputs": [ [ [ - 11.75 + 6.09375 ] ] ], @@ -743,7 +743,7 @@ "inputs": [ [ [ - 7.40625 + 6.5625 ] ] ], @@ -759,7 +759,7 @@ "inputs": [ [ [ - 7.40625 + 6.5625 ] ] ], @@ -775,7 +775,7 @@ "inputs": [ [ [ - 7.40625 + 6.5625 ] ] ], @@ -791,7 +791,7 @@ "inputs": [ [ [ - 4.15625 + 6.59375 ] ] ], @@ -807,12 +807,12 @@ "inputs": [ [ [ - 7.96875 + 8.1875 ] ], [ [ - 7.90625 + 8.3125 ] ] ] @@ -821,12 +821,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 6.40625 + 7.125 ] ] ] @@ -835,14 +835,14 @@ "inputs": [ [ [ - 199.0 + 168.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -851,7 +851,7 @@ "inputs": [ [ [ - 12.3125 + 9.125 ] ] ], @@ -899,7 +899,7 @@ "inputs": [ [ [ - 4.375 + 15.125 ] ] ], @@ -920,7 +920,7 @@ ], [ [ - 4.125 + 4.78125 ] ] ] @@ -929,7 +929,7 @@ "inputs": [ [ [ - 0.953125 + 1.0 ] ], [ @@ -943,14 +943,14 @@ "inputs": [ [ [ - 1864.0 + 1936.0 ] ] ], "outputs": [ [ [ - 0.953125 + 1.0 ] ] ] @@ -959,7 +959,7 @@ "inputs": [ [ [ - 6.8125 + 5.34375 ] ] ], @@ -975,7 +975,7 @@ "inputs": [ [ [ - 19.375 + 27.625 ] ] ], @@ -991,7 +991,7 @@ "inputs": [ [ [ - 9.1875 + 8.0625 ] ] ], @@ -1007,7 +1007,7 @@ "inputs": [ [ [ - 9.1875 + 8.0625 ] ] ], @@ -1023,7 +1023,7 @@ "inputs": [ [ [ - 9.1875 + 8.0625 ] ] ], @@ -1039,7 +1039,7 @@ "inputs": [ [ [ - 4.71875 + 7.21875 ] ] ], @@ -1055,12 +1055,12 @@ "inputs": [ [ [ - 8.875 + 8.5 ] ], [ [ - 8.75 + 9.875 ] ] ] @@ -1069,12 +1069,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 6.03125 + 7.3125 ] ] ] @@ -1083,14 +1083,14 @@ "inputs": [ [ [ - 173.0 + 182.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 
1.0 ] ] ] @@ -1099,7 +1099,7 @@ "inputs": [ [ [ - 13.25 + 10.1875 ] ] ], @@ -1147,7 +1147,7 @@ "inputs": [ [ [ - 8.8125 + 17.5 ] ] ], @@ -1168,7 +1168,7 @@ ], [ [ - 4.46875 + 4.9375 ] ] ] @@ -1191,7 +1191,7 @@ "inputs": [ [ [ - 1792.0 + 2080.0 ] ] ], @@ -1207,7 +1207,7 @@ "inputs": [ [ [ - 5.84375 + 4.90625 ] ] ], @@ -1223,7 +1223,7 @@ "inputs": [ [ [ - 33.0 + 28.75 ] ] ], @@ -1239,7 +1239,7 @@ "inputs": [ [ [ - 22.0 + 26.0 ] ] ], @@ -1255,7 +1255,7 @@ "inputs": [ [ [ - 15.0 + 9.625 ] ] ], @@ -1287,7 +1287,7 @@ "inputs": [ [ [ - 7.8125 + 7.625 ] ] ], @@ -1303,7 +1303,7 @@ "inputs": [ [ [ - 19.0 + 33.25 ] ] ], @@ -1319,7 +1319,7 @@ "inputs": [ [ [ - 7.46875 + 5.90625 ] ] ], @@ -1351,7 +1351,7 @@ "inputs": [ [ [ - 7.84375 + 7.5 ] ] ], @@ -1367,7 +1367,7 @@ "inputs": [ [ [ - 27.875 + 46.5 ] ] ], @@ -1383,7 +1383,7 @@ "inputs": [ [ [ - 9.0625 + 8.1875 ] ] ], @@ -1399,7 +1399,7 @@ "inputs": [ [ [ - 3.359375 + 3.390625 ] ] ], @@ -1415,7 +1415,7 @@ "inputs": [ [ [ - 3.359375 + 3.390625 ] ] ], @@ -1431,7 +1431,7 @@ "inputs": [ [ [ - 3.359375 + 3.390625 ] ] ], @@ -1447,7 +1447,7 @@ "inputs": [ [ [ - 6.1875 + 7.21875 ] ] ], @@ -1463,12 +1463,12 @@ "inputs": [ [ [ - 8.1875 + 6.4375 ] ], [ [ - 8.1875 + 7.15625 ] ] ] @@ -1482,7 +1482,7 @@ ], [ [ - 6.8125 + 7.21875 ] ] ] @@ -1491,7 +1491,7 @@ "inputs": [ [ [ - 282.0 + 144.0 ] ] ], @@ -1507,7 +1507,7 @@ "inputs": [ [ [ - 9.25 + 6.71875 ] ] ], @@ -1555,7 +1555,7 @@ "inputs": [ [ [ - 12.3125 + 6.09375 ] ] ], @@ -1576,7 +1576,7 @@ ], [ [ - 7.71875 + 6.9375 ] ] ] @@ -1599,7 +1599,7 @@ "inputs": [ [ [ - 1904.0 + 1776.0 ] ] ], @@ -1615,7 +1615,7 @@ "inputs": [ [ [ - 3.0625 + 2.703125 ] ] ], @@ -1631,7 +1631,7 @@ "inputs": [ [ [ - 23.875 + 27.125 ] ] ], @@ -1647,7 +1647,7 @@ "inputs": [ [ [ - 4.96875 + 4.25 ] ] ], @@ -1663,7 +1663,7 @@ "inputs": [ [ [ - 4.96875 + 4.25 ] ] ], @@ -1679,7 +1679,7 @@ "inputs": [ [ [ - 4.96875 + 4.25 ] ] ], @@ -1695,7 +1695,7 @@ "inputs": [ [ [ - 6.0 + 3.921875 ] ] ], @@ -1711,12 +1711,12 @@ "inputs": [ [ [ - 5.96875 + 5.15625 ] ], [ [ - 6.28125 + 4.875 ] ] ] @@ -1725,12 +1725,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 8.6875 + 3.921875 ] ] ] @@ -1739,14 +1739,14 @@ "inputs": [ [ [ - 155.0 + 101.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -1755,7 +1755,7 @@ "inputs": [ [ [ - 11.125 + 8.0 ] ] ], @@ -1803,7 +1803,7 @@ "inputs": [ [ [ - 5.96875 + 15.5625 ] ] ], @@ -1824,7 +1824,7 @@ ], [ [ - 5.40625 + 5.53125 ] ] ] @@ -1847,7 +1847,7 @@ "inputs": [ [ [ - 1192.0 + 1240.0 ] ] ], @@ -1863,7 +1863,7 @@ "inputs": [ [ [ - 2.84375 + 3.03125 ] ] ], @@ -1879,7 +1879,7 @@ "inputs": [ [ [ - 18.125 + 26.125 ] ] ], @@ -1895,7 +1895,7 @@ "inputs": [ [ [ - 7.875 + 5.90625 ] ] ], @@ -1911,7 +1911,7 @@ "inputs": [ [ [ - 7.875 + 5.90625 ] ] ], @@ -1927,7 +1927,7 @@ "inputs": [ [ [ - 7.875 + 5.90625 ] ] ], @@ -1943,7 +1943,7 @@ "inputs": [ [ [ - 5.125 + 5.375 ] ] ], @@ -1959,12 +1959,12 @@ "inputs": [ [ [ - 7.375 + 5.96875 ] ], [ [ - 7.65625 + 6.21875 ] ] ] @@ -1978,7 +1978,7 @@ ], [ [ - 6.625 + 5.5625 ] ] ] @@ -1987,7 +1987,7 @@ "inputs": [ [ [ - 194.0 + 135.0 ] ] ], @@ -2003,7 +2003,7 @@ "inputs": [ [ [ - 14.1875 + 8.875 ] ] ], @@ -2051,7 +2051,7 @@ "inputs": [ [ [ - 4.875 + 18.75 ] ] ], @@ -2072,7 +2072,7 @@ ], [ [ - 4.78125 + 5.59375 ] ] ] @@ -2095,7 +2095,7 @@ "inputs": [ [ [ - 980.0 + 884.0 ] ] ], @@ -2111,7 +2111,7 @@ "inputs": [ [ [ - 3.375 + 3.296875 ] ] ], @@ -2127,7 +2127,7 @@ "inputs": [ [ [ - 22.75 + 16.125 ] ] ], @@ -2143,7 +2143,7 @@ "inputs": [ [ [ - 8.875 + 6.375 ] ] ], @@ -2159,7 +2159,7 @@ "inputs": [ [ [ - 8.875 + 
6.375 ] ] ], @@ -2175,7 +2175,7 @@ "inputs": [ [ [ - 8.875 + 6.375 ] ] ], @@ -2191,7 +2191,7 @@ "inputs": [ [ [ - 6.53125 + 6.0 ] ] ], @@ -2207,12 +2207,12 @@ "inputs": [ [ [ - 7.5625 + 6.25 ] ], [ [ - 7.71875 + 6.09375 ] ] ] @@ -2226,7 +2226,7 @@ ], [ [ - 7.96875 + 6.75 ] ] ] @@ -2235,7 +2235,7 @@ "inputs": [ [ [ - 188.0 + 127.5 ] ] ], @@ -2251,7 +2251,7 @@ "inputs": [ [ [ - 14.875 + 8.4375 ] ] ], @@ -2299,7 +2299,7 @@ "inputs": [ [ [ - 4.53125 + 18.375 ] ] ], @@ -2320,7 +2320,7 @@ ], [ [ - 6.28125 + 5.34375 ] ] ] @@ -2343,7 +2343,7 @@ "inputs": [ [ [ - 2528.0 + 2208.0 ] ] ], @@ -2359,7 +2359,7 @@ "inputs": [ [ [ - 3.671875 + 3.390625 ] ] ], @@ -2375,7 +2375,7 @@ "inputs": [ [ [ - 26.375 + 51.5 ] ] ], @@ -2391,7 +2391,7 @@ "inputs": [ [ [ - 6.875 + 5.0 ] ] ], @@ -2407,7 +2407,7 @@ "inputs": [ [ [ - 6.875 + 5.0 ] ] ], @@ -2423,7 +2423,7 @@ "inputs": [ [ [ - 6.875 + 5.0 ] ] ], @@ -2439,7 +2439,7 @@ "inputs": [ [ [ - 6.0625 + 5.71875 ] ] ], @@ -2455,12 +2455,12 @@ "inputs": [ [ [ - 7.0 + 6.25 ] ], [ [ - 7.15625 + 7.03125 ] ] ] @@ -2469,12 +2469,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 6.96875 + 5.71875 ] ] ] @@ -2483,14 +2483,14 @@ "inputs": [ [ [ - 185.0 + 129.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -2499,7 +2499,7 @@ "inputs": [ [ [ - 16.375 + 7.9375 ] ] ], @@ -2547,7 +2547,7 @@ "inputs": [ [ [ - 3.796875 + 22.875 ] ] ], @@ -2568,7 +2568,7 @@ ], [ [ - 4.5625 + 4.875 ] ] ] @@ -2591,7 +2591,7 @@ "inputs": [ [ [ - 1448.0 + 1240.0 ] ] ], @@ -2607,7 +2607,7 @@ "inputs": [ [ [ - 3.53125 + 3.296875 ] ] ], @@ -2623,7 +2623,7 @@ "inputs": [ [ [ - 26.125 + 37.5 ] ] ], @@ -2639,7 +2639,7 @@ "inputs": [ [ [ - 7.28125 + 4.875 ] ] ], @@ -2655,7 +2655,7 @@ "inputs": [ [ [ - 7.28125 + 4.875 ] ] ], @@ -2671,7 +2671,7 @@ "inputs": [ [ [ - 7.28125 + 4.875 ] ] ], @@ -2687,7 +2687,7 @@ "inputs": [ [ [ - 4.5625 + 5.3125 ] ] ], @@ -2703,12 +2703,12 @@ "inputs": [ [ [ - 6.65625 + 5.625 ] ], [ [ - 7.3125 + 5.875 ] ] ] @@ -2722,7 +2722,7 @@ ], [ [ - 5.6875 + 5.375 ] ] ] @@ -2731,7 +2731,7 @@ "inputs": [ [ [ - 172.0 + 108.0 ] ] ], @@ -2747,7 +2747,7 @@ "inputs": [ [ [ - 17.75 + 7.375 ] ] ], @@ -2795,7 +2795,7 @@ "inputs": [ [ [ - 3.34375 + 20.25 ] ] ], @@ -2816,7 +2816,7 @@ ], [ [ - 4.09375 + 3.859375 ] ] ] @@ -2839,7 +2839,7 @@ "inputs": [ [ [ - 1104.0 + 932.0 ] ] ], @@ -2855,7 +2855,7 @@ "inputs": [ [ [ - 3.890625 + 3.703125 ] ] ], @@ -2871,7 +2871,7 @@ "inputs": [ [ [ - 22.75 + 61.25 ] ] ], @@ -2887,7 +2887,7 @@ "inputs": [ [ [ - 8.3125 + 5.125 ] ] ], @@ -2903,7 +2903,7 @@ "inputs": [ [ [ - 8.3125 + 5.125 ] ] ], @@ -2919,7 +2919,7 @@ "inputs": [ [ [ - 8.3125 + 5.125 ] ] ], @@ -2935,7 +2935,7 @@ "inputs": [ [ [ - 4.625 + 4.34375 ] ] ], @@ -2951,12 +2951,12 @@ "inputs": [ [ [ - 6.21875 + 5.1875 ] ], [ [ - 6.5625 + 6.40625 ] ] ] @@ -2965,12 +2965,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 5.3125 + 4.46875 ] ] ] @@ -2979,14 +2979,14 @@ "inputs": [ [ [ - 149.0 + 128.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -2995,7 +2995,7 @@ "inputs": [ [ [ - 20.25 + 8.0 ] ] ], @@ -3043,7 +3043,7 @@ "inputs": [ [ [ - 1.21875 + 1.765625 ] ] ], @@ -3064,7 +3064,7 @@ ], [ [ - 3.640625 + 2.9375 ] ] ] @@ -3073,7 +3073,7 @@ "inputs": [ [ [ - 0.98046875 + 1.0 ] ], [ @@ -3087,14 +3087,14 @@ "inputs": [ [ [ - 940.0 + 692.0 ] ] ], "outputs": [ [ [ - 0.98046875 + 1.0 ] ] ] @@ -3103,7 +3103,7 @@ "inputs": [ [ [ - 3.59375 + 3.40625 ] ] ], @@ -3119,7 +3119,7 @@ "inputs": [ [ [ - 22.125 + 61.75 ] ] ], @@ -3135,7 +3135,7 @@ "inputs": [ [ [ - 8.9375 + 4.625 ] ] ], @@ -3151,7 +3151,7 @@ "inputs": [ [ [ - 
8.9375 + 4.625 ] ] ], @@ -3167,7 +3167,7 @@ "inputs": [ [ [ - 8.9375 + 4.625 ] ] ], @@ -3183,7 +3183,7 @@ "inputs": [ [ [ - 3.71875 + 4.6875 ] ] ], @@ -3199,12 +3199,12 @@ "inputs": [ [ [ - 5.9375 + 4.96875 ] ], [ [ - 5.75 + 5.53125 ] ] ] @@ -3213,12 +3213,12 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ [ - 5.03125 + 5.21875 ] ] ] @@ -3227,14 +3227,14 @@ "inputs": [ [ [ - 145.0 + 83.0 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -3243,7 +3243,7 @@ "inputs": [ [ [ - 18.75 + 7.21875 ] ] ], @@ -3291,7 +3291,7 @@ "inputs": [ [ [ - 3.296875 + 13.0 ] ] ], @@ -3312,7 +3312,7 @@ ], [ [ - 3.734375 + 2.6875 ] ] ] @@ -3321,7 +3321,7 @@ "inputs": [ [ [ - 0.953125 + 1.0 ] ], [ @@ -3335,14 +3335,14 @@ "inputs": [ [ [ - 988.0 + 628.0 ] ] ], "outputs": [ [ [ - 0.953125 + 1.0 ] ] ] @@ -3351,7 +3351,7 @@ "inputs": [ [ [ - 3.734375 + 3.75 ] ] ], @@ -3367,7 +3367,7 @@ "inputs": [ [ [ - 28.5 + 16.25 ] ] ], @@ -3383,7 +3383,7 @@ "inputs": [ [ [ - 8.5625 + 4.53125 ] ] ], @@ -3399,7 +3399,7 @@ "inputs": [ [ [ - 8.5625 + 4.53125 ] ] ], @@ -3415,7 +3415,7 @@ "inputs": [ [ [ - 8.5625 + 4.53125 ] ] ], @@ -3431,7 +3431,7 @@ "inputs": [ [ [ - 4.78125 + 5.625 ] ] ], @@ -3447,12 +3447,12 @@ "inputs": [ [ [ - 5.8125 + 4.78125 ] ], [ [ - 6.0625 + 4.78125 ] ] ] @@ -3461,12 +3461,12 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ [ - 5.71875 + 6.03125 ] ] ] @@ -3475,14 +3475,14 @@ "inputs": [ [ [ - 139.0 + 76.5 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -3491,7 +3491,7 @@ "inputs": [ [ [ - 19.0 + 6.65625 ] ] ], @@ -3539,7 +3539,7 @@ "inputs": [ [ [ - 1.515625 + 24.25 ] ] ], @@ -3560,7 +3560,7 @@ ], [ [ - 3.921875 + 2.890625 ] ] ] @@ -3569,7 +3569,7 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ @@ -3583,14 +3583,14 @@ "inputs": [ [ [ - 1368.0 + 908.0 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -3599,7 +3599,7 @@ "inputs": [ [ [ - 3.90625 + 3.84375 ] ] ], @@ -3615,7 +3615,7 @@ "inputs": [ [ [ - 24.75 + 29.5 ] ] ], @@ -3631,7 +3631,7 @@ "inputs": [ [ [ - 8.5 + 5.0 ] ] ], @@ -3647,7 +3647,7 @@ "inputs": [ [ [ - 8.5 + 5.0 ] ] ], @@ -3663,7 +3663,7 @@ "inputs": [ [ [ - 8.5 + 5.0 ] ] ], @@ -3679,7 +3679,7 @@ "inputs": [ [ [ - 4.65625 + 5.03125 ] ] ], @@ -3695,12 +3695,12 @@ "inputs": [ [ [ - 6.0625 + 4.96875 ] ], [ [ - 6.53125 + 4.875 ] ] ] @@ -3709,12 +3709,12 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ [ - 5.5625 + 5.0625 ] ] ] @@ -3723,14 +3723,14 @@ "inputs": [ [ [ - 129.0 + 88.5 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -3739,7 +3739,7 @@ "inputs": [ [ [ - 16.75 + 6.0625 ] ] ], @@ -3787,7 +3787,7 @@ "inputs": [ [ [ - 6.5 + 32.0 ] ] ], @@ -3808,7 +3808,7 @@ ], [ [ - 3.71875 + 2.59375 ] ] ] @@ -3817,7 +3817,7 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ @@ -3831,14 +3831,14 @@ "inputs": [ [ [ - 1312.0 + 1024.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -3847,7 +3847,7 @@ "inputs": [ [ [ - 5.15625 + 3.5625 ] ] ], @@ -3863,7 +3863,7 @@ "inputs": [ [ [ - 30.125 + 20.0 ] ] ], @@ -3879,7 +3879,7 @@ "inputs": [ [ [ - 44.25 + 31.5 ] ] ], @@ -3895,7 +3895,7 @@ "inputs": [ [ [ - 8.0625 + 9.0 ] ] ], @@ -3911,7 +3911,7 @@ "inputs": [ [ [ - 3.59375 + 3.0625 ] ] ], @@ -3927,7 +3927,7 @@ "inputs": [ [ [ - 3.59375 + 3.0625 ] ] ], @@ -3943,7 +3943,7 @@ "inputs": [ [ [ - 3.59375 + 3.0625 ] ] ], @@ -3959,7 +3959,7 @@ "inputs": [ [ [ - 4.125 + 4.53125 ] ] ], @@ -3975,12 +3975,12 @@ "inputs": [ [ [ - 6.90625 + 6.6875 ] ], [ [ - 8.4375 + 7.21875 ] ] ] @@ -3989,12 +3989,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 5.4375 + 5.0 ] ] ] @@ -4003,14 +4003,14 @@ "inputs": [ [ [ - 230.0 + 203.0 ] ] ], "outputs": [ [ [ - 0.9921875 
+ 1.0 ] ] ] @@ -4019,7 +4019,7 @@ "inputs": [ [ [ - 12.5625 + 8.625 ] ] ], @@ -4067,7 +4067,7 @@ "inputs": [ [ [ - 5.875 + 23.375 ] ] ], @@ -4088,7 +4088,7 @@ ], [ [ - 6.9375 + 5.34375 ] ] ] @@ -4111,7 +4111,7 @@ "inputs": [ [ [ - 1608.0 + 1584.0 ] ] ], @@ -4127,7 +4127,7 @@ "inputs": [ [ [ - 5.5625 + 3.96875 ] ] ], @@ -4143,7 +4143,7 @@ "inputs": [ [ [ - 34.5 + 16.625 ] ] ], @@ -4159,7 +4159,7 @@ "inputs": [ [ [ - 5.09375 + 4.03125 ] ] ], @@ -4175,7 +4175,7 @@ "inputs": [ [ [ - 5.09375 + 4.03125 ] ] ], @@ -4191,7 +4191,7 @@ "inputs": [ [ [ - 5.09375 + 4.03125 ] ] ], @@ -4207,7 +4207,7 @@ "inputs": [ [ [ - 4.3125 + 4.5 ] ] ], @@ -4223,12 +4223,12 @@ "inputs": [ [ [ - 7.3125 + 5.9375 ] ], [ [ - 6.4375 + 6.59375 ] ] ] @@ -4237,12 +4237,12 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ [ - 6.15625 + 5.25 ] ] ] @@ -4251,14 +4251,14 @@ "inputs": [ [ [ - 166.0 + 107.0 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -4267,7 +4267,7 @@ "inputs": [ [ [ - 16.0 + 7.9375 ] ] ], @@ -4315,7 +4315,7 @@ "inputs": [ [ [ - 6.5625 + 5.96875 ] ] ], @@ -4336,7 +4336,7 @@ ], [ [ - 7.71875 + 7.0 ] ] ] @@ -4359,7 +4359,7 @@ "inputs": [ [ [ - 1528.0 + 1424.0 ] ] ], @@ -4375,7 +4375,7 @@ "inputs": [ [ [ - 3.453125 + 3.140625 ] ] ], @@ -4391,7 +4391,7 @@ "inputs": [ [ [ - 30.125 + 26.375 ] ] ], @@ -4407,7 +4407,7 @@ "inputs": [ [ [ - 6.125 + 3.890625 ] ] ], @@ -4423,7 +4423,7 @@ "inputs": [ [ [ - 6.125 + 3.890625 ] ] ], @@ -4439,7 +4439,7 @@ "inputs": [ [ [ - 6.125 + 3.890625 ] ] ], @@ -4455,7 +4455,7 @@ "inputs": [ [ [ - 4.96875 + 4.65625 ] ] ], @@ -4471,12 +4471,12 @@ "inputs": [ [ [ - 6.5625 + 5.3125 ] ], [ [ - 7.21875 + 5.28125 ] ] ] @@ -4485,12 +4485,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 6.8125 + 5.625 ] ] ] @@ -4499,14 +4499,14 @@ "inputs": [ [ [ - 157.0 + 96.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -4515,7 +4515,7 @@ "inputs": [ [ [ - 17.375 + 9.8125 ] ] ], @@ -4563,7 +4563,7 @@ "inputs": [ [ [ - 17.375 + 17.5 ] ] ], @@ -4584,7 +4584,7 @@ ], [ [ - 9.5 + 6.21875 ] ] ] @@ -4607,7 +4607,7 @@ "inputs": [ [ [ - 1400.0 + 1200.0 ] ] ], @@ -4623,7 +4623,7 @@ "inputs": [ [ [ - 3.6875 + 3.46875 ] ] ], @@ -4639,7 +4639,7 @@ "inputs": [ [ [ - 46.0 + 33.25 ] ] ], @@ -4655,7 +4655,7 @@ "inputs": [ [ [ - 6.0625 + 3.640625 ] ] ], @@ -4671,7 +4671,7 @@ "inputs": [ [ [ - 6.0625 + 3.640625 ] ] ], @@ -4687,7 +4687,7 @@ "inputs": [ [ [ - 6.0625 + 3.640625 ] ] ], @@ -4703,7 +4703,7 @@ "inputs": [ [ [ - 4.1875 + 4.15625 ] ] ], @@ -4719,12 +4719,12 @@ "inputs": [ [ [ - 6.40625 + 5.34375 ] ], [ [ - 6.53125 + 5.125 ] ] ] @@ -4733,12 +4733,12 @@ "inputs": [ [ [ - 0.98046875 + 1.0 ] ], [ [ - 5.875 + 4.46875 ] ] ] @@ -4747,14 +4747,14 @@ "inputs": [ [ [ - 157.0 + 80.5 ] ] ], "outputs": [ [ [ - 0.98046875 + 1.0 ] ] ] @@ -4763,7 +4763,7 @@ "inputs": [ [ [ - 17.75 + 11.375 ] ] ], @@ -4811,7 +4811,7 @@ "inputs": [ [ [ - 13.25 + 7.3125 ] ] ], @@ -4832,7 +4832,7 @@ ], [ [ - 8.0625 + 5.5625 ] ] ] @@ -4855,7 +4855,7 @@ "inputs": [ [ [ - 1624.0 + 1440.0 ] ] ], @@ -4871,7 +4871,7 @@ "inputs": [ [ [ - 4.03125 + 3.796875 ] ] ], @@ -4887,7 +4887,7 @@ "inputs": [ [ [ - 54.25 + 26.875 ] ] ], @@ -4903,7 +4903,7 @@ "inputs": [ [ [ - 6.65625 + 4.09375 ] ] ], @@ -4919,7 +4919,7 @@ "inputs": [ [ [ - 6.65625 + 4.09375 ] ] ], @@ -4935,7 +4935,7 @@ "inputs": [ [ [ - 6.65625 + 4.09375 ] ] ], @@ -4951,7 +4951,7 @@ "inputs": [ [ [ - 4.34375 + 3.65625 ] ] ], @@ -4967,12 +4967,12 @@ "inputs": [ [ [ - 6.15625 + 4.4375 ] ], [ [ - 5.71875 + 4.9375 ] ] ] @@ -4981,12 +4981,12 @@ "inputs": [ [ [ - 0.96875 + 1.0 ] ], [ [ - 4.84375 + 3.859375 ] ] ] 
@@ -4995,14 +4995,14 @@ "inputs": [ [ [ - 152.0 + 79.5 ] ] ], "outputs": [ [ [ - 0.96875 + 1.0 ] ] ] @@ -5011,7 +5011,7 @@ "inputs": [ [ [ - 16.75 + 10.5 ] ] ], @@ -5059,7 +5059,7 @@ "inputs": [ [ [ - 18.125 + 14.125 ] ] ], @@ -5080,7 +5080,7 @@ ], [ [ - 6.40625 + 4.53125 ] ] ] @@ -5103,7 +5103,7 @@ "inputs": [ [ [ - 2304.0 + 1864.0 ] ] ], @@ -5119,7 +5119,7 @@ "inputs": [ [ [ - 4.1875 + 4.15625 ] ] ], @@ -5135,7 +5135,7 @@ "inputs": [ [ [ - 87.0 + 23.25 ] ] ], @@ -5151,7 +5151,7 @@ "inputs": [ [ [ - 8.375 + 5.03125 ] ] ], @@ -5167,7 +5167,7 @@ "inputs": [ [ [ - 8.375 + 5.03125 ] ] ], @@ -5183,7 +5183,7 @@ "inputs": [ [ [ - 8.375 + 5.03125 ] ] ], @@ -5199,7 +5199,7 @@ "inputs": [ [ [ - 4.09375 + 3.4375 ] ] ], @@ -5215,12 +5215,12 @@ "inputs": [ [ [ - 5.78125 + 4.75 ] ], [ [ - 6.03125 + 4.875 ] ] ] @@ -5229,12 +5229,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 5.5625 + 3.859375 ] ] ] @@ -5243,14 +5243,14 @@ "inputs": [ [ [ - 176.0 + 72.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -5259,7 +5259,7 @@ "inputs": [ [ [ - 14.5625 + 9.0625 ] ] ], @@ -5307,7 +5307,7 @@ "inputs": [ [ [ - 12.8125 + 13.25 ] ] ], @@ -5328,7 +5328,7 @@ ], [ [ - 6.71875 + 4.59375 ] ] ] @@ -5351,7 +5351,7 @@ "inputs": [ [ [ - 1872.0 + 1512.0 ] ] ], @@ -5367,7 +5367,7 @@ "inputs": [ [ [ - 4.84375 + 4.375 ] ] ], @@ -5383,7 +5383,7 @@ "inputs": [ [ [ - 92.5 + 21.875 ] ] ], @@ -5399,7 +5399,7 @@ "inputs": [ [ [ - 8.5625 + 5.65625 ] ] ], @@ -5415,7 +5415,7 @@ "inputs": [ [ [ - 8.5625 + 5.65625 ] ] ], @@ -5431,7 +5431,7 @@ "inputs": [ [ [ - 8.5625 + 5.65625 ] ] ], @@ -5447,7 +5447,7 @@ "inputs": [ [ [ - 4.375 + 3.84375 ] ] ], @@ -5463,12 +5463,12 @@ "inputs": [ [ [ - 6.3125 + 4.78125 ] ], [ [ - 6.3125 + 4.6875 ] ] ] @@ -5477,12 +5477,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 5.3125 + 4.21875 ] ] ] @@ -5491,14 +5491,14 @@ "inputs": [ [ [ - 166.0 + 91.5 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -5507,7 +5507,7 @@ "inputs": [ [ [ - 12.5 + 8.375 ] ] ], @@ -5576,7 +5576,7 @@ ], [ [ - 4.625 + 4.0625 ] ] ] @@ -5599,7 +5599,7 @@ "inputs": [ [ [ - 2096.0 + 1728.0 ] ] ], @@ -5615,7 +5615,7 @@ "inputs": [ [ [ - 5.75 + 4.59375 ] ] ], @@ -5631,7 +5631,7 @@ "inputs": [ [ [ - 111.0 + 17.5 ] ] ], @@ -5647,7 +5647,7 @@ "inputs": [ [ [ - 7.8125 + 5.5 ] ] ], @@ -5663,7 +5663,7 @@ "inputs": [ [ [ - 7.8125 + 5.5 ] ] ], @@ -5679,7 +5679,7 @@ "inputs": [ [ [ - 7.8125 + 5.5 ] ] ], @@ -5695,7 +5695,7 @@ "inputs": [ [ [ - 5.1875 + 4.3125 ] ] ], @@ -5711,12 +5711,12 @@ "inputs": [ [ [ - 5.875 + 4.875 ] ], [ [ - 6.65625 + 5.34375 ] ] ] @@ -5725,12 +5725,12 @@ "inputs": [ [ [ - 0.9609375 + 1.0 ] ], [ [ - 6.6875 + 4.46875 ] ] ] @@ -5739,14 +5739,14 @@ "inputs": [ [ [ - 139.0 + 80.0 ] ] ], "outputs": [ [ [ - 0.9609375 + 1.0 ] ] ] @@ -5755,7 +5755,7 @@ "inputs": [ [ [ - 9.8125 + 6.75 ] ] ], @@ -5803,7 +5803,7 @@ "inputs": [ [ [ - 20.625 + 25.125 ] ] ], @@ -5824,7 +5824,7 @@ ], [ [ - 5.25 + 4.0625 ] ] ] @@ -5847,7 +5847,7 @@ "inputs": [ [ [ - 2368.0 + 2112.0 ] ] ], @@ -5863,7 +5863,7 @@ "inputs": [ [ [ - 5.84375 + 4.96875 ] ] ], @@ -5879,7 +5879,7 @@ "inputs": [ [ [ - 107.0 + 24.5 ] ] ], @@ -5895,7 +5895,7 @@ "inputs": [ [ [ - 7.46875 + 5.3125 ] ] ], @@ -5911,7 +5911,7 @@ "inputs": [ [ [ - 7.46875 + 5.3125 ] ] ], @@ -5927,7 +5927,7 @@ "inputs": [ [ [ - 7.46875 + 5.3125 ] ] ], @@ -5943,7 +5943,7 @@ "inputs": [ [ [ - 7.09375 + 5.3125 ] ] ], @@ -5959,12 +5959,12 @@ "inputs": [ [ [ - 6.34375 + 5.09375 ] ], [ [ - 7.875 + 5.1875 ] ] ] @@ -5973,12 +5973,12 @@ "inputs": [ [ [ - 0.953125 + 1.0 ] ], [ [ - 7.40625 + 5.3125 ] ] ] @@ 
-5987,14 +5987,14 @@ "inputs": [ [ [ - 164.0 + 110.0 ] ] ], "outputs": [ [ [ - 0.953125 + 1.0 ] ] ] @@ -6003,7 +6003,7 @@ "inputs": [ [ [ - 8.5 + 5.375 ] ] ], @@ -6051,7 +6051,7 @@ "inputs": [ [ [ - 17.375 + 25.75 ] ] ], @@ -6072,7 +6072,7 @@ ], [ [ - 5.625 + 4.3125 ] ] ] @@ -6095,7 +6095,7 @@ "inputs": [ [ [ - 4160.0 + 3280.0 ] ] ], @@ -6111,7 +6111,7 @@ "inputs": [ [ [ - 6.125 + 4.75 ] ] ], @@ -6127,7 +6127,7 @@ "inputs": [ [ [ - 98.0 + 55.75 ] ] ], @@ -6143,7 +6143,7 @@ "inputs": [ [ [ - 7.375 + 4.6875 ] ] ], @@ -6159,7 +6159,7 @@ "inputs": [ [ [ - 7.375 + 4.6875 ] ] ], @@ -6175,7 +6175,7 @@ "inputs": [ [ [ - 7.375 + 4.6875 ] ] ], @@ -6191,7 +6191,7 @@ "inputs": [ [ [ - 6.28125 + 5.5 ] ] ], @@ -6207,12 +6207,12 @@ "inputs": [ [ [ - 6.625 + 5.40625 ] ], [ [ - 7.46875 + 5.28125 ] ] ] @@ -6226,7 +6226,7 @@ ], [ [ - 6.625 + 5.6875 ] ] ] @@ -6235,7 +6235,7 @@ "inputs": [ [ [ - 223.0 + 96.0 ] ] ], @@ -6251,7 +6251,7 @@ "inputs": [ [ [ - 7.90625 + 4.40625 ] ] ], @@ -6299,7 +6299,7 @@ "inputs": [ [ [ - 16.875 + 26.375 ] ] ], @@ -6320,7 +6320,7 @@ ], [ [ - 5.78125 + 4.375 ] ] ] @@ -6343,7 +6343,7 @@ "inputs": [ [ [ - 3680.0 + 2640.0 ] ] ], @@ -6359,7 +6359,7 @@ "inputs": [ [ [ - 6.0 + 4.375 ] ] ], @@ -6375,7 +6375,7 @@ "inputs": [ [ [ - 55.5 + 29.25 ] ] ], @@ -6391,7 +6391,7 @@ "inputs": [ [ [ - 28.75 + 19.875 ] ] ], @@ -6407,7 +6407,7 @@ "inputs": [ [ [ - 8.0 + 5.96875 ] ] ], @@ -6439,7 +6439,7 @@ "inputs": [ [ [ - 7.4375 + 7.34375 ] ] ], @@ -6455,7 +6455,7 @@ "inputs": [ [ [ - 84.5 + 100.0 ] ] ], @@ -6471,7 +6471,7 @@ "inputs": [ [ [ - 7.6875 + 5.125 ] ] ], @@ -6503,7 +6503,7 @@ "inputs": [ [ [ - 9.0625 + 8.375 ] ] ], @@ -6519,7 +6519,7 @@ "inputs": [ [ [ - 9.3125 + 7.09375 ] ] ], @@ -6535,7 +6535,7 @@ "inputs": [ [ [ - 4.25 + 3.859375 ] ] ], @@ -6551,7 +6551,7 @@ "inputs": [ [ [ - 4.25 + 3.859375 ] ] ], @@ -6567,7 +6567,7 @@ "inputs": [ [ [ - 4.25 + 3.859375 ] ] ], @@ -6583,7 +6583,7 @@ "inputs": [ [ [ - 6.34375 + 6.6875 ] ] ], @@ -6599,12 +6599,12 @@ "inputs": [ [ [ - 8.5625 + 8.625 ] ], [ [ - 9.6875 + 8.9375 ] ] ] @@ -6613,12 +6613,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 6.9375 + 6.71875 ] ] ] @@ -6627,14 +6627,14 @@ "inputs": [ [ [ - 372.0 + 304.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -6643,7 +6643,7 @@ "inputs": [ [ [ - 11.8125 + 6.71875 ] ] ], @@ -6691,7 +6691,7 @@ "inputs": [ [ [ - 5.0 + 4.34375 ] ] ], @@ -6712,7 +6712,7 @@ ], [ [ - 4.75 + 4.25 ] ] ] @@ -6735,7 +6735,7 @@ "inputs": [ [ [ - 732.0 + 804.0 ] ] ], @@ -6751,7 +6751,7 @@ "inputs": [ [ [ - 3.015625 + 2.6875 ] ] ], @@ -6767,7 +6767,7 @@ "inputs": [ [ [ - 31.625 + 26.375 ] ] ], @@ -6783,7 +6783,7 @@ "inputs": [ [ [ - 6.0 + 4.25 ] ] ], @@ -6799,7 +6799,7 @@ "inputs": [ [ [ - 6.0 + 4.25 ] ] ], @@ -6815,7 +6815,7 @@ "inputs": [ [ [ - 6.0 + 4.25 ] ] ], @@ -6831,7 +6831,7 @@ "inputs": [ [ [ - 4.375 + 6.84375 ] ] ], @@ -6847,12 +6847,12 @@ "inputs": [ [ [ - 8.25 + 7.5 ] ], [ [ - 10.875 + 7.4375 ] ] ] @@ -6861,12 +6861,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 6.0625 + 6.84375 ] ] ] @@ -6875,14 +6875,14 @@ "inputs": [ [ [ - 312.0 + 193.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -6891,7 +6891,7 @@ "inputs": [ [ [ - 13.75 + 7.78125 ] ] ], @@ -6939,7 +6939,7 @@ "inputs": [ [ [ - 5.40625 + 20.125 ] ] ], @@ -6960,7 +6960,7 @@ ], [ [ - 6.84375 + 5.40625 ] ] ] @@ -6983,7 +6983,7 @@ "inputs": [ [ [ - 864.0 + 844.0 ] ] ], @@ -6999,7 +6999,7 @@ "inputs": [ [ [ - 3.15625 + 3.3125 ] ] ], @@ -7015,7 +7015,7 @@ "inputs": [ [ [ - 22.75 + 24.0 ] ] ], @@ -7031,7 +7031,7 @@ "inputs": [ [ [ - 7.21875 + 
4.90625 ] ] ], @@ -7047,7 +7047,7 @@ "inputs": [ [ [ - 7.21875 + 4.90625 ] ] ], @@ -7063,7 +7063,7 @@ "inputs": [ [ [ - 7.21875 + 4.90625 ] ] ], @@ -7079,7 +7079,7 @@ "inputs": [ [ [ - 6.125 + 7.1875 ] ] ], @@ -7095,12 +7095,12 @@ "inputs": [ [ [ - 7.28125 + 7.25 ] ], [ [ - 8.25 + 7.8125 ] ] ] @@ -7109,12 +7109,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 6.84375 + 7.1875 ] ] ] @@ -7123,14 +7123,14 @@ "inputs": [ [ [ - 236.0 + 160.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -7139,7 +7139,7 @@ "inputs": [ [ [ - 16.375 + 10.25 ] ] ], @@ -7187,7 +7187,7 @@ "inputs": [ [ [ - 5.53125 + 14.1875 ] ] ], @@ -7208,7 +7208,7 @@ ], [ [ - 7.125 + 5.9375 ] ] ] @@ -7231,7 +7231,7 @@ "inputs": [ [ [ - 840.0 + 868.0 ] ] ], @@ -7247,7 +7247,7 @@ "inputs": [ [ [ - 3.640625 + 3.703125 ] ] ], @@ -7263,7 +7263,7 @@ "inputs": [ [ [ - 34.25 + 25.75 ] ] ], @@ -7279,7 +7279,7 @@ "inputs": [ [ [ - 6.75 + 4.75 ] ] ], @@ -7295,7 +7295,7 @@ "inputs": [ [ [ - 6.75 + 4.75 ] ] ], @@ -7311,7 +7311,7 @@ "inputs": [ [ [ - 6.75 + 4.75 ] ] ], @@ -7327,7 +7327,7 @@ "inputs": [ [ [ - 6.09375 + 8.125 ] ] ], @@ -7343,12 +7343,12 @@ "inputs": [ [ [ - 8.375 + 7.46875 ] ], [ [ - 9.5 + 8.0625 ] ] ] @@ -7357,12 +7357,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 7.6875 + 8.125 ] ] ] @@ -7371,14 +7371,14 @@ "inputs": [ [ [ - 231.0 + 186.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -7387,7 +7387,7 @@ "inputs": [ [ [ - 18.5 + 13.125 ] ] ], @@ -7435,7 +7435,7 @@ "inputs": [ [ [ - 5.625 + 17.0 ] ] ], @@ -7456,7 +7456,7 @@ ], [ [ - 6.6875 + 5.125 ] ] ] @@ -7479,7 +7479,7 @@ "inputs": [ [ [ - 924.0 + 944.0 ] ] ], @@ -7495,7 +7495,7 @@ "inputs": [ [ [ - 3.875 + 4.15625 ] ] ], @@ -7511,7 +7511,7 @@ "inputs": [ [ [ - 44.0 + 25.5 ] ] ], @@ -7527,7 +7527,7 @@ "inputs": [ [ [ - 7.8125 + 5.40625 ] ] ], @@ -7543,7 +7543,7 @@ "inputs": [ [ [ - 7.8125 + 5.40625 ] ] ], @@ -7559,7 +7559,7 @@ "inputs": [ [ [ - 7.8125 + 5.40625 ] ] ], @@ -7575,7 +7575,7 @@ "inputs": [ [ [ - 5.6875 + 6.6875 ] ] ], @@ -7591,12 +7591,12 @@ "inputs": [ [ [ - 7.71875 + 7.1875 ] ], [ [ - 7.78125 + 6.84375 ] ] ] @@ -7605,12 +7605,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.15625 + 6.6875 ] ] ] @@ -7619,14 +7619,14 @@ "inputs": [ [ [ - 190.0 + 153.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -7635,7 +7635,7 @@ "inputs": [ [ [ - 19.125 + 15.375 ] ] ], @@ -7683,7 +7683,7 @@ "inputs": [ [ [ - 5.21875 + 16.375 ] ] ], @@ -7704,7 +7704,7 @@ ], [ [ - 4.96875 + 4.625 ] ] ] @@ -7727,7 +7727,7 @@ "inputs": [ [ [ - 888.0 + 940.0 ] ] ], @@ -7743,7 +7743,7 @@ "inputs": [ [ [ - 4.34375 + 4.25 ] ] ], @@ -7759,7 +7759,7 @@ "inputs": [ [ [ - 54.0 + 42.75 ] ] ], @@ -7775,7 +7775,7 @@ "inputs": [ [ [ - 7.84375 + 6.09375 ] ] ], @@ -7791,7 +7791,7 @@ "inputs": [ [ [ - 7.84375 + 6.09375 ] ] ], @@ -7807,7 +7807,7 @@ "inputs": [ [ [ - 7.84375 + 6.09375 ] ] ], @@ -7823,7 +7823,7 @@ "inputs": [ [ [ - 7.09375 + 6.65625 ] ] ], @@ -7839,12 +7839,12 @@ "inputs": [ [ [ - 7.625 + 6.625 ] ], [ [ - 7.59375 + 6.6875 ] ] ] @@ -7853,12 +7853,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.625 + 6.65625 ] ] ] @@ -7867,14 +7867,14 @@ "inputs": [ [ [ - 204.0 + 161.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -7883,7 +7883,7 @@ "inputs": [ [ [ - 20.0 + 16.625 ] ] ], @@ -7931,7 +7931,7 @@ "inputs": [ [ [ - 4.625 + 12.5625 ] ] ], @@ -7952,7 +7952,7 @@ ], [ [ - 4.625 + 4.5 ] ] ] @@ -7975,7 +7975,7 @@ "inputs": [ [ [ - 1168.0 + 1144.0 ] ] ], @@ -7991,7 +7991,7 @@ "inputs": [ [ [ - 4.40625 + 4.34375 ] ] ], @@ -8007,7 +8007,7 @@ "inputs": [ [ [ - 53.75 + 36.75 ] ] 
], @@ -8023,7 +8023,7 @@ "inputs": [ [ [ - 8.5625 + 7.21875 ] ] ], @@ -8039,7 +8039,7 @@ "inputs": [ [ [ - 8.5625 + 7.21875 ] ] ], @@ -8055,7 +8055,7 @@ "inputs": [ [ [ - 8.5625 + 7.21875 ] ] ], @@ -8071,7 +8071,7 @@ "inputs": [ [ [ - 6.40625 + 6.25 ] ] ], @@ -8087,12 +8087,12 @@ "inputs": [ [ [ - 7.0 + 6.875 ] ], [ [ - 7.0 + 6.6875 ] ] ] @@ -8101,12 +8101,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.40625 + 6.25 ] ] ] @@ -8115,14 +8115,14 @@ "inputs": [ [ [ - 198.0 + 149.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -8131,7 +8131,7 @@ "inputs": [ [ [ - 20.125 + 17.25 ] ] ], @@ -8179,7 +8179,7 @@ "inputs": [ [ [ - 3.703125 + 15.0625 ] ] ], @@ -8200,7 +8200,7 @@ ], [ [ - 3.96875 + 3.84375 ] ] ] @@ -8223,7 +8223,7 @@ "inputs": [ [ [ - 2144.0 + 1976.0 ] ] ], @@ -8239,7 +8239,7 @@ "inputs": [ [ [ - 4.71875 + 4.46875 ] ] ], @@ -8255,7 +8255,7 @@ "inputs": [ [ [ - 56.75 + 38.0 ] ] ], @@ -8271,7 +8271,7 @@ "inputs": [ [ [ - 9.5625 + 8.125 ] ] ], @@ -8287,7 +8287,7 @@ "inputs": [ [ [ - 9.5625 + 8.125 ] ] ], @@ -8303,7 +8303,7 @@ "inputs": [ [ [ - 9.5625 + 8.125 ] ] ], @@ -8319,7 +8319,7 @@ "inputs": [ [ [ - 5.5625 + 6.78125 ] ] ], @@ -8335,12 +8335,12 @@ "inputs": [ [ [ - 7.25 + 7.15625 ] ], [ [ - 7.5625 + 7.53125 ] ] ] @@ -8349,12 +8349,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 6.65625 + 6.78125 ] ] ] @@ -8363,14 +8363,14 @@ "inputs": [ [ [ - 171.0 + 182.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -8379,7 +8379,7 @@ "inputs": [ [ [ - 20.5 + 17.625 ] ] ], @@ -8427,7 +8427,7 @@ "inputs": [ [ [ - 13.4375 + 12.4375 ] ] ], @@ -8448,7 +8448,7 @@ ], [ [ - 4.46875 + 3.875 ] ] ] @@ -8471,7 +8471,7 @@ "inputs": [ [ [ - 1784.0 + 1656.0 ] ] ], @@ -8487,7 +8487,7 @@ "inputs": [ [ [ - 4.625 + 4.96875 ] ] ], @@ -8503,7 +8503,7 @@ "inputs": [ [ [ - 51.25 + 35.75 ] ] ], @@ -8519,7 +8519,7 @@ "inputs": [ [ [ - 9.875 + 8.375 ] ] ], @@ -8535,7 +8535,7 @@ "inputs": [ [ [ - 9.875 + 8.375 ] ] ], @@ -8551,7 +8551,7 @@ "inputs": [ [ [ - 9.875 + 8.375 ] ] ], @@ -8567,7 +8567,7 @@ "inputs": [ [ [ - 6.78125 + 6.90625 ] ] ], @@ -8583,12 +8583,12 @@ "inputs": [ [ [ - 6.59375 + 6.34375 ] ], [ [ - 7.15625 + 6.34375 ] ] ] @@ -8597,12 +8597,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 8.8125 + 7.78125 ] ] ] @@ -8611,14 +8611,14 @@ "inputs": [ [ [ - 143.0 + 138.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -8627,7 +8627,7 @@ "inputs": [ [ [ - 19.625 + 17.25 ] ] ], @@ -8675,7 +8675,7 @@ "inputs": [ [ [ - 7.71875 + 23.375 ] ] ], @@ -8696,7 +8696,7 @@ ], [ [ - 5.46875 + 4.5 ] ] ] @@ -8719,7 +8719,7 @@ "inputs": [ [ [ - 5216.0 + 4768.0 ] ] ], @@ -8735,7 +8735,7 @@ "inputs": [ [ [ - 4.75 + 4.59375 ] ] ], @@ -8751,7 +8751,7 @@ "inputs": [ [ [ - 50.0 + 65.5 ] ] ], @@ -8767,7 +8767,7 @@ "inputs": [ [ [ - 10.625 + 9.1875 ] ] ], @@ -8783,7 +8783,7 @@ "inputs": [ [ [ - 10.625 + 9.1875 ] ] ], @@ -8799,7 +8799,7 @@ "inputs": [ [ [ - 10.625 + 9.1875 ] ] ], @@ -8815,7 +8815,7 @@ "inputs": [ [ [ - 6.125 + 7.34375 ] ] ], @@ -8831,12 +8831,12 @@ "inputs": [ [ [ - 6.71875 + 6.34375 ] ], [ [ - 7.125 + 6.65625 ] ] ] @@ -8845,12 +8845,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.1875 + 7.34375 ] ] ] @@ -8859,14 +8859,14 @@ "inputs": [ [ [ - 151.0 + 141.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -8875,7 +8875,7 @@ "inputs": [ [ [ - 23.5 + 21.125 ] ] ], @@ -8923,7 +8923,7 @@ "inputs": [ [ [ - 5.5 + 31.125 ] ] ], @@ -8944,7 +8944,7 @@ ], [ [ - 4.78125 + 4.84375 ] ] ] @@ -8967,7 +8967,7 @@ "inputs": [ [ [ - 3392.0 + 2928.0 ] ] ], @@ -8983,7 +8983,7 @@ "inputs": [ [ [ - 4.875 + 4.53125 ] ] 
], @@ -8999,7 +8999,7 @@ "inputs": [ [ [ - 46.0 + 61.0 ] ] ], @@ -9015,7 +9015,7 @@ "inputs": [ [ [ - 162.0 + 146.0 ] ] ], @@ -9031,7 +9031,7 @@ "inputs": [ [ [ - 8.0625 + 6.53125 ] ] ], @@ -9047,7 +9047,7 @@ "inputs": [ [ [ - 5.15625 + 4.625 ] ] ], @@ -9063,7 +9063,7 @@ "inputs": [ [ [ - 5.15625 + 4.625 ] ] ], @@ -9079,7 +9079,7 @@ "inputs": [ [ [ - 5.15625 + 4.625 ] ] ], @@ -9095,7 +9095,7 @@ "inputs": [ [ [ - 4.375 + 6.0 ] ] ], @@ -9111,12 +9111,12 @@ "inputs": [ [ [ - 7.125 + 6.28125 ] ], [ [ - 9.6875 + 6.34375 ] ] ] @@ -9125,12 +9125,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 6.6875 + 6.1875 ] ] ] @@ -9139,14 +9139,14 @@ "inputs": [ [ [ - 228.0 + 136.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -9155,7 +9155,7 @@ "inputs": [ [ [ - 12.0625 + 7.65625 ] ] ], @@ -9203,7 +9203,7 @@ "inputs": [ [ [ - 2.90625 + 4.4375 ] ] ], @@ -9224,7 +9224,7 @@ ], [ [ - 4.59375 + 4.21875 ] ] ] @@ -9233,7 +9233,7 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ @@ -9247,14 +9247,14 @@ "inputs": [ [ [ - 648.0 + 560.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -9263,7 +9263,7 @@ "inputs": [ [ [ - 5.59375 + 4.09375 ] ] ], @@ -9279,7 +9279,7 @@ "inputs": [ [ [ - 22.75 + 25.125 ] ] ], @@ -9295,7 +9295,7 @@ "inputs": [ [ [ - 6.3125 + 4.09375 ] ] ], @@ -9311,7 +9311,7 @@ "inputs": [ [ [ - 6.3125 + 4.09375 ] ] ], @@ -9327,7 +9327,7 @@ "inputs": [ [ [ - 6.3125 + 4.09375 ] ] ], @@ -9343,7 +9343,7 @@ "inputs": [ [ [ - 4.71875 + 5.21875 ] ] ], @@ -9359,12 +9359,12 @@ "inputs": [ [ [ - 7.5625 + 5.78125 ] ], [ [ - 8.4375 + 6.03125 ] ] ] @@ -9373,12 +9373,12 @@ "inputs": [ [ [ - 0.9765625 + 1.0 ] ], [ [ - 6.75 + 6.0 ] ] ] @@ -9387,14 +9387,14 @@ "inputs": [ [ [ - 241.0 + 113.0 ] ] ], "outputs": [ [ [ - 0.9765625 + 1.0 ] ] ] @@ -9403,7 +9403,7 @@ "inputs": [ [ [ - 13.9375 + 7.40625 ] ] ], @@ -9451,7 +9451,7 @@ "inputs": [ [ [ - 4.84375 + 15.5 ] ] ], @@ -9472,7 +9472,7 @@ ], [ [ - 6.75 + 5.46875 ] ] ] @@ -9495,7 +9495,7 @@ "inputs": [ [ [ - 684.0 + 512.0 ] ] ], @@ -9511,7 +9511,7 @@ "inputs": [ [ [ - 7.0 + 3.84375 ] ] ], @@ -9527,7 +9527,7 @@ "inputs": [ [ [ - 23.0 + 58.25 ] ] ], @@ -9543,7 +9543,7 @@ "inputs": [ [ [ - 8.3125 + 4.71875 ] ] ], @@ -9559,7 +9559,7 @@ "inputs": [ [ [ - 8.3125 + 4.71875 ] ] ], @@ -9575,7 +9575,7 @@ "inputs": [ [ [ - 8.3125 + 4.71875 ] ] ], @@ -9591,7 +9591,7 @@ "inputs": [ [ [ - 4.875 + 4.5625 ] ] ], @@ -9607,12 +9607,12 @@ "inputs": [ [ [ - 7.03125 + 5.15625 ] ], [ [ - 7.75 + 5.3125 ] ] ] @@ -9621,12 +9621,12 @@ "inputs": [ [ [ - 0.98046875 + 1.0 ] ], [ [ - 7.0 + 5.28125 ] ] ] @@ -9635,14 +9635,14 @@ "inputs": [ [ [ - 184.0 + 85.0 ] ] ], "outputs": [ [ [ - 0.98046875 + 1.0 ] ] ] @@ -9651,7 +9651,7 @@ "inputs": [ [ [ - 15.8125 + 8.8125 ] ] ], @@ -9699,7 +9699,7 @@ "inputs": [ [ [ - 5.53125 + 14.8125 ] ] ], @@ -9720,7 +9720,7 @@ ], [ [ - 6.34375 + 4.59375 ] ] ] @@ -9743,7 +9743,7 @@ "inputs": [ [ [ - 904.0 + 772.0 ] ] ], @@ -9759,7 +9759,7 @@ "inputs": [ [ [ - 6.71875 + 3.859375 ] ] ], @@ -9775,7 +9775,7 @@ "inputs": [ [ [ - 45.0 + 102.5 ] ] ], @@ -9791,7 +9791,7 @@ "inputs": [ [ [ - 8.375 + 4.75 ] ] ], @@ -9807,7 +9807,7 @@ "inputs": [ [ [ - 8.375 + 4.75 ] ] ], @@ -9823,7 +9823,7 @@ "inputs": [ [ [ - 8.375 + 4.75 ] ] ], @@ -9839,7 +9839,7 @@ "inputs": [ [ [ - 4.90625 + 5.5625 ] ] ], @@ -9855,12 +9855,12 @@ "inputs": [ [ [ - 7.9375 + 6.78125 ] ], [ [ - 9.0625 + 5.90625 ] ] ] @@ -9869,12 +9869,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 7.3125 + 6.21875 ] ] ] @@ -9883,14 +9883,14 @@ "inputs": [ [ [ - 182.0 + 118.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ 
-9899,7 +9899,7 @@ "inputs": [ [ [ - 15.9375 + 8.875 ] ] ], @@ -9947,7 +9947,7 @@ "inputs": [ [ [ - 7.3125 + 13.875 ] ] ], @@ -9968,7 +9968,7 @@ ], [ [ - 6.65625 + 5.28125 ] ] ] @@ -9991,7 +9991,7 @@ "inputs": [ [ [ - 1360.0 + 1392.0 ] ] ], @@ -10007,7 +10007,7 @@ "inputs": [ [ [ - 6.84375 + 4.21875 ] ] ], @@ -10023,7 +10023,7 @@ "inputs": [ [ [ - 86.5 + 105.5 ] ] ], @@ -10039,7 +10039,7 @@ "inputs": [ [ [ - 8.125 + 4.90625 ] ] ], @@ -10055,7 +10055,7 @@ "inputs": [ [ [ - 8.125 + 4.90625 ] ] ], @@ -10071,7 +10071,7 @@ "inputs": [ [ [ - 8.125 + 4.90625 ] ] ], @@ -10087,7 +10087,7 @@ "inputs": [ [ [ - 6.0 + 5.59375 ] ] ], @@ -10103,12 +10103,12 @@ "inputs": [ [ [ - 7.3125 + 5.40625 ] ], [ [ - 8.375 + 5.90625 ] ] ] @@ -10117,12 +10117,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 7.625 + 5.625 ] ] ] @@ -10131,14 +10131,14 @@ "inputs": [ [ [ - 173.0 + 100.5 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -10147,7 +10147,7 @@ "inputs": [ [ [ - 17.75 + 11.5 ] ] ], @@ -10195,7 +10195,7 @@ "inputs": [ [ [ - 5.78125 + 10.375 ] ] ], @@ -10216,7 +10216,7 @@ ], [ [ - 5.40625 + 4.09375 ] ] ] @@ -10239,7 +10239,7 @@ "inputs": [ [ [ - 1072.0 + 880.0 ] ] ], @@ -10255,7 +10255,7 @@ "inputs": [ [ [ - 6.875 + 4.5 ] ] ], @@ -10271,7 +10271,7 @@ "inputs": [ [ [ - 39.75 + 38.25 ] ] ], @@ -10287,7 +10287,7 @@ "inputs": [ [ [ - 8.75 + 5.59375 ] ] ], @@ -10303,7 +10303,7 @@ "inputs": [ [ [ - 8.75 + 5.59375 ] ] ], @@ -10319,7 +10319,7 @@ "inputs": [ [ [ - 8.75 + 5.59375 ] ] ], @@ -10335,7 +10335,7 @@ "inputs": [ [ [ - 5.0 + 6.0 ] ] ], @@ -10351,12 +10351,12 @@ "inputs": [ [ [ - 7.3125 + 5.75 ] ], [ [ - 7.3125 + 5.96875 ] ] ] @@ -10365,12 +10365,12 @@ "inputs": [ [ [ - 0.9765625 + 1.0 ] ], [ [ - 7.65625 + 6.0 ] ] ] @@ -10379,14 +10379,14 @@ "inputs": [ [ [ - 194.0 + 115.5 ] ] ], "outputs": [ [ [ - 0.9765625 + 1.0 ] ] ] @@ -10395,7 +10395,7 @@ "inputs": [ [ [ - 17.375 + 11.375 ] ] ], @@ -10443,7 +10443,7 @@ "inputs": [ [ [ - 7.1875 + 13.9375 ] ] ], @@ -10487,7 +10487,7 @@ "inputs": [ [ [ - 1816.0 + 1392.0 ] ] ], @@ -10503,7 +10503,7 @@ "inputs": [ [ [ - 6.84375 + 4.75 ] ] ], @@ -10519,7 +10519,7 @@ "inputs": [ [ [ - 43.5 + 35.75 ] ] ], @@ -10535,7 +10535,7 @@ "inputs": [ [ [ - 10.4375 + 6.625 ] ] ], @@ -10551,7 +10551,7 @@ "inputs": [ [ [ - 10.4375 + 6.625 ] ] ], @@ -10567,7 +10567,7 @@ "inputs": [ [ [ - 10.4375 + 6.625 ] ] ], @@ -10583,7 +10583,7 @@ "inputs": [ [ [ - 7.3125 + 6.6875 ] ] ], @@ -10599,12 +10599,12 @@ "inputs": [ [ [ - 7.09375 + 5.9375 ] ], [ [ - 8.5625 + 6.34375 ] ] ] @@ -10613,12 +10613,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 8.5625 + 6.6875 ] ] ] @@ -10627,14 +10627,14 @@ "inputs": [ [ [ - 204.0 + 96.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -10643,7 +10643,7 @@ "inputs": [ [ [ - 17.875 + 11.625 ] ] ], @@ -10691,7 +10691,7 @@ "inputs": [ [ [ - 4.09375 + 13.25 ] ] ], @@ -10712,7 +10712,7 @@ ], [ [ - 5.40625 + 4.46875 ] ] ] @@ -10735,7 +10735,7 @@ "inputs": [ [ [ - 2320.0 + 1944.0 ] ] ], @@ -10751,7 +10751,7 @@ "inputs": [ [ [ - 6.71875 + 5.0625 ] ] ], @@ -10767,7 +10767,7 @@ "inputs": [ [ [ - 53.0 + 58.5 ] ] ], @@ -10783,7 +10783,7 @@ "inputs": [ [ [ - 9.0 + 5.78125 ] ] ], @@ -10799,7 +10799,7 @@ "inputs": [ [ [ - 9.0 + 5.78125 ] ] ], @@ -10815,7 +10815,7 @@ "inputs": [ [ [ - 9.0 + 5.78125 ] ] ], @@ -10831,7 +10831,7 @@ "inputs": [ [ [ - 6.0 + 7.4375 ] ] ], @@ -10847,12 +10847,12 @@ "inputs": [ [ [ - 6.875 + 5.78125 ] ], [ [ - 8.1875 + 5.90625 ] ] ] @@ -10861,12 +10861,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.6875 + 7.4375 ] ] ] @@ -10875,14 +10875,14 @@ 
"inputs": [ [ [ - 211.0 + 116.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -10891,7 +10891,7 @@ "inputs": [ [ [ - 16.125 + 10.5 ] ] ], @@ -10939,7 +10939,7 @@ "inputs": [ [ [ - 5.625 + 15.9375 ] ] ], @@ -10960,7 +10960,7 @@ ], [ [ - 5.28125 + 3.59375 ] ] ] @@ -10983,7 +10983,7 @@ "inputs": [ [ [ - 2640.0 + 2272.0 ] ] ], @@ -10999,7 +10999,7 @@ "inputs": [ [ [ - 6.59375 + 4.65625 ] ] ], @@ -11015,7 +11015,7 @@ "inputs": [ [ [ - 56.75 + 58.5 ] ] ], @@ -11031,7 +11031,7 @@ "inputs": [ [ [ - 10.8125 + 7.03125 ] ] ], @@ -11047,7 +11047,7 @@ "inputs": [ [ [ - 10.8125 + 7.03125 ] ] ], @@ -11063,7 +11063,7 @@ "inputs": [ [ [ - 10.8125 + 7.03125 ] ] ], @@ -11079,7 +11079,7 @@ "inputs": [ [ [ - 7.8125 + 6.71875 ] ] ], @@ -11095,12 +11095,12 @@ "inputs": [ [ [ - 6.9375 + 6.78125 ] ], [ [ - 7.875 + 5.84375 ] ] ] @@ -11114,7 +11114,7 @@ ], [ [ - 9.125 + 7.46875 ] ] ] @@ -11123,7 +11123,7 @@ "inputs": [ [ [ - 201.0 + 108.5 ] ] ], @@ -11139,7 +11139,7 @@ "inputs": [ [ [ - 16.25 + 10.75 ] ] ], @@ -11187,7 +11187,7 @@ "inputs": [ [ [ - 6.03125 + 16.375 ] ] ], @@ -11208,7 +11208,7 @@ ], [ [ - 4.5 + 4.0 ] ] ] @@ -11231,7 +11231,7 @@ "inputs": [ [ [ - 2944.0 + 2480.0 ] ] ], @@ -11247,7 +11247,7 @@ "inputs": [ [ [ - 6.09375 + 4.8125 ] ] ], @@ -11263,7 +11263,7 @@ "inputs": [ [ [ - 55.0 + 60.75 ] ] ], @@ -11279,7 +11279,7 @@ "inputs": [ [ [ - 11.625 + 7.375 ] ] ], @@ -11295,7 +11295,7 @@ "inputs": [ [ [ - 11.625 + 7.375 ] ] ], @@ -11311,7 +11311,7 @@ "inputs": [ [ [ - 11.625 + 7.375 ] ] ], @@ -11327,7 +11327,7 @@ "inputs": [ [ [ - 5.5 + 7.90625 ] ] ], @@ -11343,12 +11343,12 @@ "inputs": [ [ [ - 6.625 + 5.90625 ] ], [ [ - 7.125 + 6.15625 ] ] ] @@ -11357,12 +11357,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 9.125 + 8.8125 ] ] ] @@ -11371,14 +11371,14 @@ "inputs": [ [ [ - 136.0 + 84.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -11387,7 +11387,7 @@ "inputs": [ [ [ - 17.75 + 12.125 ] ] ], @@ -11435,7 +11435,7 @@ "inputs": [ [ [ - 5.15625 + 4.4375 ] ] ], @@ -11456,7 +11456,7 @@ ], [ [ - 4.78125 + 5.125 ] ] ] @@ -11479,7 +11479,7 @@ "inputs": [ [ [ - 2336.0 + 2016.0 ] ] ], @@ -11495,7 +11495,7 @@ "inputs": [ [ [ - 5.09375 + 4.34375 ] ] ], @@ -11511,7 +11511,7 @@ "inputs": [ [ [ - 43.5 + 50.0 ] ] ], @@ -11527,7 +11527,7 @@ "inputs": [ [ [ - 120.0 + 74.5 ] ] ], @@ -11543,7 +11543,7 @@ "inputs": [ [ [ - 12.4375 + 6.84375 ] ] ], @@ -11559,7 +11559,7 @@ "inputs": [ [ [ - 4.15625 + 3.453125 ] ] ], @@ -11575,7 +11575,7 @@ "inputs": [ [ [ - 4.15625 + 3.453125 ] ] ], @@ -11591,7 +11591,7 @@ "inputs": [ [ [ - 4.15625 + 3.453125 ] ] ], @@ -11607,7 +11607,7 @@ "inputs": [ [ [ - 4.8125 + 4.5 ] ] ], @@ -11623,12 +11623,12 @@ "inputs": [ [ [ - 7.59375 + 7.4375 ] ], [ [ - 8.875 + 7.875 ] ] ] @@ -11642,7 +11642,7 @@ ], [ [ - 5.0625 + 4.8125 ] ] ] @@ -11651,7 +11651,7 @@ "inputs": [ [ [ - 208.0 + 185.0 ] ] ], @@ -11667,7 +11667,7 @@ "inputs": [ [ [ - 10.6875 + 9.4375 ] ] ], @@ -11715,7 +11715,7 @@ "inputs": [ [ [ - 2.421875 + 2.734375 ] ] ], @@ -11736,7 +11736,7 @@ ], [ [ - 4.75 + 4.59375 ] ] ] @@ -11745,7 +11745,7 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ @@ -11759,14 +11759,14 @@ "inputs": [ [ [ - 1080.0 + 952.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -11775,7 +11775,7 @@ "inputs": [ [ [ - 3.84375 + 3.125 ] ] ], @@ -11791,7 +11791,7 @@ "inputs": [ [ [ - 16.125 + 22.25 ] ] ], @@ -11807,7 +11807,7 @@ "inputs": [ [ [ - 4.6875 + 3.671875 ] ] ], @@ -11823,7 +11823,7 @@ "inputs": [ [ [ - 4.6875 + 3.671875 ] ] ], @@ -11839,7 +11839,7 @@ "inputs": [ [ [ - 4.6875 + 3.671875 ] ] ], @@ -11855,7 +11855,7 @@ 
"inputs": [ [ [ - 4.84375 + 5.09375 ] ] ], @@ -11871,12 +11871,12 @@ "inputs": [ [ [ - 7.34375 + 7.0 ] ], [ [ - 8.4375 + 7.46875 ] ] ] @@ -11885,12 +11885,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 5.90625 + 5.125 ] ] ] @@ -11899,14 +11899,14 @@ "inputs": [ [ [ - 227.0 + 140.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -11915,7 +11915,7 @@ "inputs": [ [ [ - 10.6875 + 9.5 ] ] ], @@ -11963,7 +11963,7 @@ "inputs": [ [ [ - 2.71875 + 12.5 ] ] ], @@ -11984,7 +11984,7 @@ ], [ [ - 6.34375 + 6.3125 ] ] ] @@ -12007,7 +12007,7 @@ "inputs": [ [ [ - 1056.0 + 916.0 ] ] ], @@ -12023,7 +12023,7 @@ "inputs": [ [ [ - 4.5625 + 3.234375 ] ] ], @@ -12039,7 +12039,7 @@ "inputs": [ [ [ - 23.125 + 24.875 ] ] ], @@ -12055,7 +12055,7 @@ "inputs": [ [ [ - 5.28125 + 4.21875 ] ] ], @@ -12071,7 +12071,7 @@ "inputs": [ [ [ - 5.28125 + 4.21875 ] ] ], @@ -12087,7 +12087,7 @@ "inputs": [ [ [ - 5.28125 + 4.21875 ] ] ], @@ -12103,7 +12103,7 @@ "inputs": [ [ [ - 4.25 + 4.59375 ] ] ], @@ -12119,12 +12119,12 @@ "inputs": [ [ [ - 7.84375 + 7.5 ] ], [ [ - 8.9375 + 7.53125 ] ] ] @@ -12133,12 +12133,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 6.78125 + 4.90625 ] ] ] @@ -12147,14 +12147,14 @@ "inputs": [ [ [ - 227.0 + 156.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -12163,7 +12163,7 @@ "inputs": [ [ [ - 11.3125 + 9.4375 ] ] ], @@ -12211,7 +12211,7 @@ "inputs": [ [ [ - 3.703125 + 13.0 ] ] ], @@ -12232,7 +12232,7 @@ ], [ [ - 4.9375 + 5.65625 ] ] ] @@ -12255,7 +12255,7 @@ "inputs": [ [ [ - 676.0 + 748.0 ] ] ], @@ -12271,7 +12271,7 @@ "inputs": [ [ [ - 4.5 + 3.46875 ] ] ], @@ -12287,7 +12287,7 @@ "inputs": [ [ [ - 35.0 + 16.75 ] ] ], @@ -12303,7 +12303,7 @@ "inputs": [ [ [ - 5.75 + 4.71875 ] ] ], @@ -12319,7 +12319,7 @@ "inputs": [ [ [ - 5.75 + 4.71875 ] ] ], @@ -12335,7 +12335,7 @@ "inputs": [ [ [ - 5.75 + 4.71875 ] ] ], @@ -12351,7 +12351,7 @@ "inputs": [ [ [ - 4.3125 + 4.78125 ] ] ], @@ -12367,12 +12367,12 @@ "inputs": [ [ [ - 7.625 + 6.84375 ] ], [ [ - 8.25 + 7.125 ] ] ] @@ -12381,12 +12381,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 5.90625 + 4.875 ] ] ] @@ -12395,14 +12395,14 @@ "inputs": [ [ [ - 189.0 + 147.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -12411,7 +12411,7 @@ "inputs": [ [ [ - 12.6875 + 10.1875 ] ] ], @@ -12459,7 +12459,7 @@ "inputs": [ [ [ - 3.203125 + 11.1875 ] ] ], @@ -12480,7 +12480,7 @@ ], [ [ - 4.75 + 4.375 ] ] ] @@ -12503,7 +12503,7 @@ "inputs": [ [ [ - 924.0 + 1004.0 ] ] ], @@ -12519,7 +12519,7 @@ "inputs": [ [ [ - 4.78125 + 3.671875 ] ] ], @@ -12535,7 +12535,7 @@ "inputs": [ [ [ - 37.25 + 20.5 ] ] ], @@ -12551,7 +12551,7 @@ "inputs": [ [ [ - 6.25 + 4.9375 ] ] ], @@ -12567,7 +12567,7 @@ "inputs": [ [ [ - 6.25 + 4.9375 ] ] ], @@ -12583,7 +12583,7 @@ "inputs": [ [ [ - 6.25 + 4.9375 ] ] ], @@ -12599,7 +12599,7 @@ "inputs": [ [ [ - 5.0625 + 6.40625 ] ] ], @@ -12615,12 +12615,12 @@ "inputs": [ [ [ - 7.09375 + 6.6875 ] ], [ [ - 7.59375 + 7.53125 ] ] ] @@ -12629,12 +12629,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 6.1875 + 6.4375 ] ] ] @@ -12643,14 +12643,14 @@ "inputs": [ [ [ - 163.0 + 140.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -12659,7 +12659,7 @@ "inputs": [ [ [ - 13.75 + 11.5 ] ] ], @@ -12707,7 +12707,7 @@ "inputs": [ [ [ - 3.109375 + 11.4375 ] ] ], @@ -12728,7 +12728,7 @@ ], [ [ - 4.71875 + 3.75 ] ] ] @@ -12751,7 +12751,7 @@ "inputs": [ [ [ - 648.0 + 600.0 ] ] ], @@ -12767,7 +12767,7 @@ "inputs": [ [ [ - 4.84375 + 3.859375 ] ] ], @@ -12783,7 +12783,7 @@ "inputs": [ [ [ - 52.25 + 19.0 ] ] ], @@ -12799,7 +12799,7 @@ "inputs": [ [ [ - 6.75 + 
5.65625 ] ] ], @@ -12815,7 +12815,7 @@ "inputs": [ [ [ - 6.75 + 5.65625 ] ] ], @@ -12831,7 +12831,7 @@ "inputs": [ [ [ - 6.75 + 5.65625 ] ] ], @@ -12847,7 +12847,7 @@ "inputs": [ [ [ - 5.0625 + 5.5625 ] ] ], @@ -12863,12 +12863,12 @@ "inputs": [ [ [ - 6.71875 + 5.84375 ] ], [ [ - 7.6875 + 6.65625 ] ] ] @@ -12877,12 +12877,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 5.6875 + 5.5625 ] ] ] @@ -12891,14 +12891,14 @@ "inputs": [ [ [ - 187.0 + 130.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -12907,7 +12907,7 @@ "inputs": [ [ [ - 14.0 + 12.1875 ] ] ], @@ -12955,7 +12955,7 @@ "inputs": [ [ [ - 3.234375 + 10.875 ] ] ], @@ -12976,7 +12976,7 @@ ], [ [ - 4.21875 + 3.796875 ] ] ] @@ -12999,7 +12999,7 @@ "inputs": [ [ [ - 828.0 + 832.0 ] ] ], @@ -13015,7 +13015,7 @@ "inputs": [ [ [ - 4.78125 + 3.703125 ] ] ], @@ -13031,7 +13031,7 @@ "inputs": [ [ [ - 28.5 + 15.875 ] ] ], @@ -13047,7 +13047,7 @@ "inputs": [ [ [ - 6.40625 + 5.40625 ] ] ], @@ -13063,7 +13063,7 @@ "inputs": [ [ [ - 6.40625 + 5.40625 ] ] ], @@ -13079,7 +13079,7 @@ "inputs": [ [ [ - 6.40625 + 5.40625 ] ] ], @@ -13095,7 +13095,7 @@ "inputs": [ [ [ - 4.375 + 4.5 ] ] ], @@ -13111,12 +13111,12 @@ "inputs": [ [ [ - 6.3125 + 6.53125 ] ], [ [ - 6.625 + 6.46875 ] ] ] @@ -13130,7 +13130,7 @@ ], [ [ - 5.625 + 4.875 ] ] ] @@ -13139,7 +13139,7 @@ "inputs": [ [ [ - 154.0 + 131.0 ] ] ], @@ -13155,7 +13155,7 @@ "inputs": [ [ [ - 14.1875 + 12.5 ] ] ], @@ -13203,7 +13203,7 @@ "inputs": [ [ [ - 2.453125 + 13.4375 ] ] ], @@ -13224,7 +13224,7 @@ ], [ [ - 4.125 + 3.21875 ] ] ] @@ -13247,7 +13247,7 @@ "inputs": [ [ [ - 1480.0 + 1248.0 ] ] ], @@ -13263,7 +13263,7 @@ "inputs": [ [ [ - 4.84375 + 3.734375 ] ] ], @@ -13279,7 +13279,7 @@ "inputs": [ [ [ - 50.0 + 22.5 ] ] ], @@ -13295,7 +13295,7 @@ "inputs": [ [ [ - 7.125 + 6.09375 ] ] ], @@ -13311,7 +13311,7 @@ "inputs": [ [ [ - 7.125 + 6.09375 ] ] ], @@ -13327,7 +13327,7 @@ "inputs": [ [ [ - 7.125 + 6.09375 ] ] ], @@ -13343,7 +13343,7 @@ "inputs": [ [ [ - 6.03125 + 5.375 ] ] ], @@ -13359,12 +13359,12 @@ "inputs": [ [ [ - 7.09375 + 8.0 ] ], [ [ - 8.25 + 7.6875 ] ] ] @@ -13378,7 +13378,7 @@ ], [ [ - 9.0625 + 5.5625 ] ] ] @@ -13387,7 +13387,7 @@ "inputs": [ [ [ - 177.0 + 161.0 ] ] ], @@ -13403,7 +13403,7 @@ "inputs": [ [ [ - 14.625 + 13.4375 ] ] ], @@ -13451,7 +13451,7 @@ "inputs": [ [ [ - 2.125 + 10.75 ] ] ], @@ -13472,7 +13472,7 @@ ], [ [ - 3.59375 + 3.25 ] ] ] @@ -13495,7 +13495,7 @@ "inputs": [ [ [ - 1536.0 + 1424.0 ] ] ], @@ -13511,7 +13511,7 @@ "inputs": [ [ [ - 5.0625 + 3.828125 ] ] ], @@ -13527,7 +13527,7 @@ "inputs": [ [ [ - 43.0 + 30.5 ] ] ], @@ -13543,7 +13543,7 @@ "inputs": [ [ [ - 8.0 + 7.03125 ] ] ], @@ -13559,7 +13559,7 @@ "inputs": [ [ [ - 8.0 + 7.03125 ] ] ], @@ -13575,7 +13575,7 @@ "inputs": [ [ [ - 8.0 + 7.03125 ] ] ], @@ -13591,7 +13591,7 @@ "inputs": [ [ [ - 6.65625 + 6.40625 ] ] ], @@ -13607,12 +13607,12 @@ "inputs": [ [ [ - 6.90625 + 7.28125 ] ], [ [ - 8.4375 + 7.96875 ] ] ] @@ -13621,12 +13621,12 @@ "inputs": [ [ [ - 0.98828125 + 1.0 ] ], [ [ - 10.6875 + 6.4375 ] ] ] @@ -13635,14 +13635,14 @@ "inputs": [ [ [ - 211.0 + 190.0 ] ] ], "outputs": [ [ [ - 0.98828125 + 1.0 ] ] ] @@ -13651,7 +13651,7 @@ "inputs": [ [ [ - 17.125 + 15.9375 ] ] ], @@ -13699,7 +13699,7 @@ "inputs": [ [ [ - 4.28125 + 5.125 ] ] ], @@ -13720,7 +13720,7 @@ ], [ [ - 3.8125 + 3.4375 ] ] ] @@ -13729,7 +13729,7 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ @@ -13743,14 +13743,14 @@ "inputs": [ [ [ - 1952.0 + 1584.0 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -13759,7 +13759,7 @@ "inputs": [ [ [ - 5.4375 + 3.703125 
] ] ], @@ -13775,7 +13775,7 @@ "inputs": [ [ [ - 42.25 + 25.5 ] ] ], @@ -13791,7 +13791,7 @@ "inputs": [ [ [ - 7.0625 + 6.4375 ] ] ], @@ -13807,7 +13807,7 @@ "inputs": [ [ [ - 7.0625 + 6.4375 ] ] ], @@ -13823,7 +13823,7 @@ "inputs": [ [ [ - 7.0625 + 6.4375 ] ] ], @@ -13839,7 +13839,7 @@ "inputs": [ [ [ - 4.78125 + 5.5625 ] ] ], @@ -13855,12 +13855,12 @@ "inputs": [ [ [ - 6.96875 + 7.34375 ] ], [ [ - 7.125 + 6.84375 ] ] ] @@ -13869,12 +13869,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 5.78125 + 6.0 ] ] ] @@ -13883,14 +13883,14 @@ "inputs": [ [ [ - 188.0 + 132.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -13899,7 +13899,7 @@ "inputs": [ [ [ - 19.625 + 18.25 ] ] ], @@ -13947,7 +13947,7 @@ "inputs": [ [ [ - 4.40625 + 4.53125 ] ] ], @@ -13968,7 +13968,7 @@ ], [ [ - 4.28125 + 4.21875 ] ] ] @@ -13977,7 +13977,7 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ @@ -13991,14 +13991,14 @@ "inputs": [ [ [ - 3088.0 + 2816.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -14007,7 +14007,7 @@ "inputs": [ [ [ - 5.8125 + 4.65625 ] ] ], @@ -14023,7 +14023,7 @@ "inputs": [ [ [ - 43.75 + 26.875 ] ] ], @@ -14039,7 +14039,7 @@ "inputs": [ [ [ - 174.0 + 121.0 ] ] ], @@ -14055,7 +14055,7 @@ "inputs": [ [ [ - 10.0 + 6.75 ] ] ], @@ -14087,7 +14087,7 @@ "inputs": [ [ [ - 13.75 + 10.0625 ] ] ], @@ -14103,7 +14103,7 @@ "inputs": [ [ [ - 59.5 + 82.5 ] ] ], @@ -14119,7 +14119,7 @@ "inputs": [ [ [ - 11.75 + 6.96875 ] ] ], @@ -14151,7 +14151,7 @@ "inputs": [ [ [ - 11.875 + 9.0 ] ] ], @@ -14167,7 +14167,7 @@ "inputs": [ [ [ - 94.5 + 84.5 ] ] ], @@ -14183,7 +14183,7 @@ "inputs": [ [ [ - 9.3125 + 4.8125 ] ] ], @@ -14215,7 +14215,7 @@ "inputs": [ [ [ - 8.5625 + 6.53125 ] ] ], @@ -14231,7 +14231,7 @@ "inputs": [ [ [ - 94.0 + 100.0 ] ] ], @@ -14247,7 +14247,7 @@ "inputs": [ [ [ - 67.5 + 55.5 ] ] ], @@ -14263,7 +14263,7 @@ "inputs": [ [ [ - 6.9375 + 5.21875 ] ] ], @@ -14279,7 +14279,7 @@ "inputs": [ [ [ - 7.96875 + 6.125 ] ] ], @@ -14295,7 +14295,7 @@ "inputs": [ [ [ - 7.96875 + 6.125 ] ] ], @@ -14311,7 +14311,7 @@ "inputs": [ [ [ - 7.96875 + 6.125 ] ] ], @@ -14327,7 +14327,7 @@ "inputs": [ [ [ - 5.78125 + 7.4375 ] ] ], @@ -14343,12 +14343,12 @@ "inputs": [ [ [ - 9.125 + 10.0 ] ], [ [ - 10.125 + 9.125 ] ] ] @@ -14357,12 +14357,12 @@ "inputs": [ [ [ - 0.98046875 + 1.0 ] ], [ [ - 7.25 + 7.53125 ] ] ] @@ -14371,14 +14371,14 @@ "inputs": [ [ [ - 268.0 + 202.0 ] ] ], "outputs": [ [ [ - 0.98046875 + 1.0 ] ] ] @@ -14387,7 +14387,7 @@ "inputs": [ [ [ - 14.3125 + 11.9375 ] ] ], @@ -14435,7 +14435,7 @@ "inputs": [ [ [ - 3.59375 + 16.875 ] ] ], @@ -14456,7 +14456,7 @@ ], [ [ - 4.1875 + 4.21875 ] ] ] @@ -14479,7 +14479,7 @@ "inputs": [ [ [ - 836.0 + 800.0 ] ] ], @@ -14495,7 +14495,7 @@ "inputs": [ [ [ - 7.625 + 6.3125 ] ] ], @@ -14511,7 +14511,7 @@ "inputs": [ [ [ - 29.125 + 22.5 ] ] ], @@ -14527,7 +14527,7 @@ "inputs": [ [ [ - 12.25 + 9.6875 ] ] ], @@ -14543,7 +14543,7 @@ "inputs": [ [ [ - 12.25 + 9.6875 ] ] ], @@ -14559,7 +14559,7 @@ "inputs": [ [ [ - 12.25 + 9.6875 ] ] ], @@ -14575,7 +14575,7 @@ "inputs": [ [ [ - 5.8125 + 7.71875 ] ] ], @@ -14591,12 +14591,12 @@ "inputs": [ [ [ - 9.875 + 8.75 ] ], [ [ - 11.5625 + 11.0 ] ] ] @@ -14605,12 +14605,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 7.25 + 7.90625 ] ] ] @@ -14619,14 +14619,14 @@ "inputs": [ [ [ - 412.0 + 360.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -14635,7 +14635,7 @@ "inputs": [ [ [ - 16.0 + 13.1875 ] ] ], @@ -14683,7 +14683,7 @@ "inputs": [ [ [ - 17.75 + 27.5 ] ] ], @@ -14704,7 +14704,7 @@ ], [ [ - 4.59375 + 4.75 ] ] ] @@ -14743,7 +14743,7 @@ 
"inputs": [ [ [ - 9.625 + 7.65625 ] ] ], @@ -14759,7 +14759,7 @@ "inputs": [ [ [ - 33.0 + 41.25 ] ] ], @@ -14775,7 +14775,7 @@ "inputs": [ [ [ - 34.75 + 26.375 ] ] ], @@ -14791,7 +14791,7 @@ "inputs": [ [ [ - 7.1875 + 5.125 ] ] ], @@ -14807,7 +14807,7 @@ "inputs": [ [ [ - 8.4375 + 6.71875 ] ] ], @@ -14823,7 +14823,7 @@ "inputs": [ [ [ - 8.4375 + 6.71875 ] ] ], @@ -14839,7 +14839,7 @@ "inputs": [ [ [ - 8.4375 + 6.71875 ] ] ], @@ -14855,7 +14855,7 @@ "inputs": [ [ [ - 4.90625 + 5.75 ] ] ], @@ -14871,12 +14871,12 @@ "inputs": [ [ [ - 9.125 + 8.625 ] ], [ [ - 9.3125 + 8.6875 ] ] ] @@ -14885,12 +14885,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 5.625 + 6.21875 ] ] ] @@ -14899,14 +14899,14 @@ "inputs": [ [ [ - 272.0 + 236.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -14915,7 +14915,7 @@ "inputs": [ [ [ - 13.3125 + 10.5625 ] ] ], @@ -14963,7 +14963,7 @@ "inputs": [ [ [ - 9.75 + 21.625 ] ] ], @@ -14984,7 +14984,7 @@ ], [ [ - 4.9375 + 4.5625 ] ] ] @@ -15007,7 +15007,7 @@ "inputs": [ [ [ - 984.0 + 1008.0 ] ] ], @@ -15023,7 +15023,7 @@ "inputs": [ [ [ - 7.5625 + 5.71875 ] ] ], @@ -15039,7 +15039,7 @@ "inputs": [ [ [ - 23.5 + 42.5 ] ] ], @@ -15055,7 +15055,7 @@ "inputs": [ [ [ - 9.8125 + 7.375 ] ] ], @@ -15071,7 +15071,7 @@ "inputs": [ [ [ - 9.8125 + 7.375 ] ] ], @@ -15087,7 +15087,7 @@ "inputs": [ [ [ - 9.8125 + 7.375 ] ] ], @@ -15103,7 +15103,7 @@ "inputs": [ [ [ - 6.28125 + 8.375 ] ] ], @@ -15119,12 +15119,12 @@ "inputs": [ [ [ - 9.5 + 9.3125 ] ], [ [ - 10.5 + 9.0 ] ] ] @@ -15133,12 +15133,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 6.625 + 8.375 ] ] ] @@ -15147,14 +15147,14 @@ "inputs": [ [ [ - 508.0 + 408.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -15163,7 +15163,7 @@ "inputs": [ [ [ - 13.8125 + 10.0 ] ] ], @@ -15211,7 +15211,7 @@ "inputs": [ [ [ - 10.875 + 22.5 ] ] ], @@ -15232,7 +15232,7 @@ ], [ [ - 4.65625 + 4.71875 ] ] ] @@ -15255,7 +15255,7 @@ "inputs": [ [ [ - 1376.0 + 1416.0 ] ] ], @@ -15271,7 +15271,7 @@ "inputs": [ [ [ - 7.90625 + 5.75 ] ] ], @@ -15287,7 +15287,7 @@ "inputs": [ [ [ - 68.5 + 90.5 ] ] ], @@ -15303,7 +15303,7 @@ "inputs": [ [ [ - 21.125 + 26.0 ] ] ], @@ -15319,7 +15319,7 @@ "inputs": [ [ [ - 9.375 + 6.5 ] ] ], @@ -15335,7 +15335,7 @@ "inputs": [ [ [ - 10.0 + 6.1875 ] ] ], @@ -15351,7 +15351,7 @@ "inputs": [ [ [ - 10.0 + 6.1875 ] ] ], @@ -15367,7 +15367,7 @@ "inputs": [ [ [ - 10.0 + 6.1875 ] ] ], @@ -15383,7 +15383,7 @@ "inputs": [ [ [ - 4.78125 + 6.65625 ] ] ], @@ -15399,12 +15399,12 @@ "inputs": [ [ [ - 7.875 + 8.75 ] ], [ [ - 9.5625 + 9.5 ] ] ] @@ -15413,12 +15413,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 6.90625 + 6.6875 ] ] ] @@ -15427,14 +15427,14 @@ "inputs": [ [ [ - 243.0 + 211.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -15443,7 +15443,7 @@ "inputs": [ [ [ - 14.75 + 11.4375 ] ] ], @@ -15491,7 +15491,7 @@ "inputs": [ [ [ - 14.25 + 24.375 ] ] ], @@ -15512,7 +15512,7 @@ ], [ [ - 3.4375 + 4.59375 ] ] ] @@ -15521,7 +15521,7 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ @@ -15535,14 +15535,14 @@ "inputs": [ [ [ - 968.0 + 936.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -15551,7 +15551,7 @@ "inputs": [ [ [ - 8.25 + 6.46875 ] ] ], @@ -15567,7 +15567,7 @@ "inputs": [ [ [ - 15.875 + 28.625 ] ] ], @@ -15583,7 +15583,7 @@ "inputs": [ [ [ - 9.0625 + 6.65625 ] ] ], @@ -15599,7 +15599,7 @@ "inputs": [ [ [ - 9.0625 + 6.65625 ] ] ], @@ -15615,7 +15615,7 @@ "inputs": [ [ [ - 9.0625 + 6.65625 ] ] ], @@ -15631,7 +15631,7 @@ "inputs": [ [ [ - 5.625 + 6.03125 ] ] ], @@ -15647,12 +15647,12 @@ "inputs": [ [ [ - 8.125 + 6.5625 ] ], [ 
[ - 10.375 + 7.21875 ] ] ] @@ -15661,12 +15661,12 @@ "inputs": [ [ [ - 0.98046875 + 1.0 ] ], [ [ - 6.53125 + 8.125 ] ] ] @@ -15675,14 +15675,14 @@ "inputs": [ [ [ - 314.0 + 168.0 ] ] ], "outputs": [ [ [ - 0.98046875 + 1.0 ] ] ] @@ -15691,7 +15691,7 @@ "inputs": [ [ [ - 14.75 + 10.375 ] ] ], @@ -15739,7 +15739,7 @@ "inputs": [ [ [ - 16.125 + 37.5 ] ] ], @@ -15760,7 +15760,7 @@ ], [ [ - 3.671875 + 3.921875 ] ] ] @@ -15783,7 +15783,7 @@ "inputs": [ [ [ - 1448.0 + 1136.0 ] ] ], @@ -15799,7 +15799,7 @@ "inputs": [ [ [ - 7.0 + 5.0 ] ] ], @@ -15815,7 +15815,7 @@ "inputs": [ [ [ - 29.375 + 37.0 ] ] ], @@ -15831,7 +15831,7 @@ "inputs": [ [ [ - 22.375 + 22.25 ] ] ], @@ -15847,7 +15847,7 @@ "inputs": [ [ [ - 12.625 + 6.75 ] ] ], @@ -15879,7 +15879,7 @@ "inputs": [ [ [ - 11.75 + 7.75 ] ] ], @@ -15895,7 +15895,7 @@ "inputs": [ [ [ - 146.0 + 143.0 ] ] ], @@ -15911,7 +15911,7 @@ "inputs": [ [ [ - 7.59375 + 5.28125 ] ] ], @@ -15943,7 +15943,7 @@ "inputs": [ [ [ - 7.90625 + 7.75 ] ] ], @@ -15959,7 +15959,7 @@ "inputs": [ [ [ - 42.5 + 48.0 ] ] ], @@ -15975,7 +15975,7 @@ "inputs": [ [ [ - 8.5 + 5.125 ] ] ], @@ -16007,7 +16007,7 @@ "inputs": [ [ [ - 8.0 + 7.6875 ] ] ], @@ -16023,7 +16023,7 @@ "inputs": [ [ [ - 31.25 + 33.75 ] ] ], @@ -16039,7 +16039,7 @@ "inputs": [ [ [ - 20.5 + 21.5 ] ] ], @@ -16055,7 +16055,7 @@ "inputs": [ [ [ - 9.5625 + 8.5 ] ] ], @@ -16087,7 +16087,7 @@ "inputs": [ [ [ - 12.5 + 12.6875 ] ] ], @@ -16103,7 +16103,7 @@ "inputs": [ [ [ - 54.25 + 68.0 ] ] ], @@ -16119,7 +16119,7 @@ "inputs": [ [ [ - 9.25 + 10.9375 ] ] ], @@ -16151,7 +16151,7 @@ "inputs": [ [ [ - 12.75 + 15.25 ] ] ], @@ -16167,7 +16167,7 @@ "inputs": [ [ [ - 25.5 + 28.0 ] ] ], @@ -16183,7 +16183,7 @@ "inputs": [ [ [ - 21.375 + 9.875 ] ] ], @@ -16215,7 +16215,7 @@ "inputs": [ [ [ - 9.5625 + 9.6875 ] ] ], @@ -16231,7 +16231,7 @@ "inputs": [ [ [ - 13.25 + 14.625 ] ] ], @@ -16247,7 +16247,7 @@ "inputs": [ [ [ - 8.5625 + 10.0625 ] ] ], @@ -16263,7 +16263,7 @@ "inputs": [ [ [ - 3.75 + 3.84375 ] ] ], @@ -16279,7 +16279,7 @@ "inputs": [ [ [ - 3.75 + 3.84375 ] ] ], @@ -16295,7 +16295,7 @@ "inputs": [ [ [ - 3.75 + 3.84375 ] ] ], @@ -16311,7 +16311,7 @@ "inputs": [ [ [ - 5.5625 + 6.375 ] ] ], @@ -16327,12 +16327,12 @@ "inputs": [ [ [ - 8.0625 + 7.625 ] ], [ [ - 8.375 + 8.0 ] ] ] @@ -16341,12 +16341,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 6.4375 + 6.625 ] ] ] @@ -16355,14 +16355,14 @@ "inputs": [ [ [ - 274.0 + 266.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -16371,7 +16371,7 @@ "inputs": [ [ [ - 11.0625 + 7.28125 ] ] ], @@ -16419,7 +16419,7 @@ "inputs": [ [ [ - 3.8125 + 19.5 ] ] ], @@ -16440,7 +16440,7 @@ ], [ [ - 4.59375 + 4.53125 ] ] ] @@ -16463,7 +16463,7 @@ "inputs": [ [ [ - 1216.0 + 1168.0 ] ] ], @@ -16479,7 +16479,7 @@ "inputs": [ [ [ - 2.640625 + 2.765625 ] ] ], @@ -16495,7 +16495,7 @@ "inputs": [ [ [ - 20.875 + 18.0 ] ] ], @@ -16511,7 +16511,7 @@ "inputs": [ [ [ - 6.53125 + 4.9375 ] ] ], @@ -16527,7 +16527,7 @@ "inputs": [ [ [ - 6.53125 + 4.9375 ] ] ], @@ -16543,7 +16543,7 @@ "inputs": [ [ [ - 6.53125 + 4.9375 ] ] ], @@ -16559,7 +16559,7 @@ "inputs": [ [ [ - 6.375 + 7.3125 ] ] ], @@ -16575,12 +16575,12 @@ "inputs": [ [ [ - 7.6875 + 7.4375 ] ], [ [ - 9.6875 + 9.25 ] ] ] @@ -16589,12 +16589,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 7.59375 + 7.8125 ] ] ] @@ -16603,14 +16603,14 @@ "inputs": [ [ [ - 215.0 + 247.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -16619,7 +16619,7 @@ "inputs": [ [ [ - 12.0625 + 7.53125 ] ] ], @@ -16667,7 +16667,7 @@ "inputs": [ [ [ - 4.78125 + 19.875 ] ] ], @@ -16688,7 
+16688,7 @@ ], [ [ - 5.21875 + 5.125 ] ] ] @@ -16711,7 +16711,7 @@ "inputs": [ [ [ - 1120.0 + 988.0 ] ] ], @@ -16727,7 +16727,7 @@ "inputs": [ [ [ - 3.484375 + 3.125 ] ] ], @@ -16743,7 +16743,7 @@ "inputs": [ [ [ - 22.125 + 19.5 ] ] ], @@ -16759,7 +16759,7 @@ "inputs": [ [ [ - 8.0 + 5.84375 ] ] ], @@ -16775,7 +16775,7 @@ "inputs": [ [ [ - 8.0 + 5.84375 ] ] ], @@ -16791,7 +16791,7 @@ "inputs": [ [ [ - 8.0 + 5.84375 ] ] ], @@ -16807,7 +16807,7 @@ "inputs": [ [ [ - 6.09375 + 7.125 ] ] ], @@ -16823,12 +16823,12 @@ "inputs": [ [ [ - 7.90625 + 7.34375 ] ], [ [ - 9.0 + 7.625 ] ] ] @@ -16837,12 +16837,12 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ [ - 7.03125 + 7.25 ] ] ] @@ -16851,14 +16851,14 @@ "inputs": [ [ [ - 233.0 + 170.0 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -16867,7 +16867,7 @@ "inputs": [ [ [ - 13.25 + 7.5625 ] ] ], @@ -16915,7 +16915,7 @@ "inputs": [ [ [ - 5.59375 + 18.875 ] ] ], @@ -16936,7 +16936,7 @@ ], [ [ - 4.03125 + 3.9375 ] ] ] @@ -16959,7 +16959,7 @@ "inputs": [ [ [ - 864.0 + 780.0 ] ] ], @@ -16975,7 +16975,7 @@ "inputs": [ [ [ - 3.546875 + 3.5 ] ] ], @@ -16991,7 +16991,7 @@ "inputs": [ [ [ - 28.625 + 83.0 ] ] ], @@ -17007,7 +17007,7 @@ "inputs": [ [ [ - 8.9375 + 6.3125 ] ] ], @@ -17023,7 +17023,7 @@ "inputs": [ [ [ - 8.9375 + 6.3125 ] ] ], @@ -17039,7 +17039,7 @@ "inputs": [ [ [ - 8.9375 + 6.3125 ] ] ], @@ -17055,7 +17055,7 @@ "inputs": [ [ [ - 6.0 + 7.90625 ] ] ], @@ -17071,12 +17071,12 @@ "inputs": [ [ [ - 8.4375 + 7.5 ] ], [ [ - 8.875 + 6.625 ] ] ] @@ -17085,12 +17085,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.75 + 7.90625 ] ] ] @@ -17099,14 +17099,14 @@ "inputs": [ [ [ - 219.0 + 140.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -17115,7 +17115,7 @@ "inputs": [ [ [ - 12.875 + 6.6875 ] ] ], @@ -17163,7 +17163,7 @@ "inputs": [ [ [ - 13.0 + 23.375 ] ] ], @@ -17184,7 +17184,7 @@ ], [ [ - 3.921875 + 3.984375 ] ] ] @@ -17207,7 +17207,7 @@ "inputs": [ [ [ - 1184.0 + 1088.0 ] ] ], @@ -17223,7 +17223,7 @@ "inputs": [ [ [ - 4.34375 + 3.765625 ] ] ], @@ -17239,7 +17239,7 @@ "inputs": [ [ [ - 29.875 + 32.5 ] ] ], @@ -17255,7 +17255,7 @@ "inputs": [ [ [ - 8.1875 + 5.6875 ] ] ], @@ -17271,7 +17271,7 @@ "inputs": [ [ [ - 8.1875 + 5.6875 ] ] ], @@ -17287,7 +17287,7 @@ "inputs": [ [ [ - 8.1875 + 5.6875 ] ] ], @@ -17303,7 +17303,7 @@ "inputs": [ [ [ - 7.4375 + 8.0 ] ] ], @@ -17319,12 +17319,12 @@ "inputs": [ [ [ - 8.375 + 8.25 ] ], [ [ - 9.125 + 7.65625 ] ] ] @@ -17333,12 +17333,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 8.4375 + 8.0 ] ] ] @@ -17347,14 +17347,14 @@ "inputs": [ [ [ - 212.0 + 159.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -17363,7 +17363,7 @@ "inputs": [ [ [ - 11.8125 + 6.1875 ] ] ], @@ -17411,7 +17411,7 @@ "inputs": [ [ [ - 2.359375 + 22.125 ] ] ], @@ -17432,7 +17432,7 @@ ], [ [ - 3.640625 + 3.4375 ] ] ] @@ -17455,7 +17455,7 @@ "inputs": [ [ [ - 1080.0 + 928.0 ] ] ], @@ -17471,7 +17471,7 @@ "inputs": [ [ [ - 3.96875 + 3.640625 ] ] ], @@ -17487,7 +17487,7 @@ "inputs": [ [ [ - 37.25 + 23.875 ] ] ], @@ -17503,7 +17503,7 @@ "inputs": [ [ [ - 7.125 + 4.65625 ] ] ], @@ -17519,7 +17519,7 @@ "inputs": [ [ [ - 7.125 + 4.65625 ] ] ], @@ -17535,7 +17535,7 @@ "inputs": [ [ [ - 7.125 + 4.65625 ] ] ], @@ -17551,7 +17551,7 @@ "inputs": [ [ [ - 6.3125 + 7.3125 ] ] ], @@ -17572,7 +17572,7 @@ ], [ [ - 8.4375 + 7.59375 ] ] ] @@ -17581,12 +17581,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 7.375 + 7.75 ] ] ] @@ -17595,14 +17595,14 @@ "inputs": [ [ [ - 189.0 + 146.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -17611,7 +17611,7 @@ 
"inputs": [ [ [ - 9.625 + 5.09375 ] ] ], @@ -17659,7 +17659,7 @@ "inputs": [ [ [ - 2.546875 + 27.25 ] ] ], @@ -17680,7 +17680,7 @@ ], [ [ - 3.46875 + 3.09375 ] ] ] @@ -17703,7 +17703,7 @@ "inputs": [ [ [ - 1072.0 + 852.0 ] ] ], @@ -17719,7 +17719,7 @@ "inputs": [ [ [ - 4.34375 + 4.125 ] ] ], @@ -17735,7 +17735,7 @@ "inputs": [ [ [ - 34.25 + 26.25 ] ] ], @@ -17751,7 +17751,7 @@ "inputs": [ [ [ - 6.40625 + 4.78125 ] ] ], @@ -17767,7 +17767,7 @@ "inputs": [ [ [ - 6.40625 + 4.78125 ] ] ], @@ -17783,7 +17783,7 @@ "inputs": [ [ [ - 6.40625 + 4.78125 ] ] ], @@ -17799,7 +17799,7 @@ "inputs": [ [ [ - 4.5 + 6.46875 ] ] ], @@ -17815,12 +17815,12 @@ "inputs": [ [ [ - 7.0 + 6.4375 ] ], [ [ - 8.125 + 6.09375 ] ] ] @@ -17829,12 +17829,12 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ [ - 7.03125 + 7.0625 ] ] ] @@ -17843,14 +17843,14 @@ "inputs": [ [ [ - 175.0 + 121.5 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -17859,7 +17859,7 @@ "inputs": [ [ [ - 8.125 + 4.8125 ] ] ], @@ -17907,7 +17907,7 @@ "inputs": [ [ [ - 6.75 + 31.5 ] ] ], @@ -17928,7 +17928,7 @@ ], [ [ - 3.90625 + 3.484375 ] ] ] @@ -17937,7 +17937,7 @@ "inputs": [ [ [ - 0.984375 + 1.0 ] ], [ @@ -17951,14 +17951,14 @@ "inputs": [ [ [ - 1528.0 + 1288.0 ] ] ], "outputs": [ [ [ - 0.984375 + 1.0 ] ] ] @@ -17967,7 +17967,7 @@ "inputs": [ [ [ - 4.46875 + 4.3125 ] ] ], @@ -17983,7 +17983,7 @@ "inputs": [ [ [ - 44.5 + 40.75 ] ] ], @@ -17999,7 +17999,7 @@ "inputs": [ [ [ - 6.15625 + 4.9375 ] ] ], @@ -18015,7 +18015,7 @@ "inputs": [ [ [ - 6.15625 + 4.9375 ] ] ], @@ -18031,7 +18031,7 @@ "inputs": [ [ [ - 6.15625 + 4.9375 ] ] ], @@ -18047,7 +18047,7 @@ "inputs": [ [ [ - 7.21875 + 7.34375 ] ] ], @@ -18063,12 +18063,12 @@ "inputs": [ [ [ - 7.03125 + 6.8125 ] ], [ [ - 7.6875 + 6.65625 ] ] ] @@ -18077,12 +18077,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 8.5 + 7.59375 ] ] ] @@ -18091,14 +18091,14 @@ "inputs": [ [ [ - 153.0 + 122.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -18107,7 +18107,7 @@ "inputs": [ [ [ - 6.5625 + 4.5625 ] ] ], @@ -18155,7 +18155,7 @@ "inputs": [ [ [ - 10.375 + 28.75 ] ] ], @@ -18176,7 +18176,7 @@ ], [ [ - 3.40625 + 2.875 ] ] ] @@ -18185,7 +18185,7 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ @@ -18199,14 +18199,14 @@ "inputs": [ [ [ - 1080.0 + 912.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -18215,7 +18215,7 @@ "inputs": [ [ [ - 4.75 + 4.3125 ] ] ], @@ -18231,7 +18231,7 @@ "inputs": [ [ [ - 42.0 + 28.625 ] ] ], @@ -18247,7 +18247,7 @@ "inputs": [ [ [ - 5.625 + 4.875 ] ] ], @@ -18263,7 +18263,7 @@ "inputs": [ [ [ - 5.625 + 4.875 ] ] ], @@ -18279,7 +18279,7 @@ "inputs": [ [ [ - 5.625 + 4.875 ] ] ], @@ -18295,7 +18295,7 @@ "inputs": [ [ [ - 8.6875 + 8.0625 ] ] ], @@ -18311,12 +18311,12 @@ "inputs": [ [ [ - 7.84375 + 6.84375 ] ], [ [ - 8.625 + 7.8125 ] ] ] @@ -18325,12 +18325,12 @@ "inputs": [ [ [ - 0.99609375 + 1.0 ] ], [ [ - 11.375 + 8.125 ] ] ] @@ -18339,14 +18339,14 @@ "inputs": [ [ [ - 179.0 + 142.0 ] ] ], "outputs": [ [ [ - 0.99609375 + 1.0 ] ] ] @@ -18355,7 +18355,7 @@ "inputs": [ [ [ - 5.96875 + 4.53125 ] ] ], @@ -18403,7 +18403,7 @@ "inputs": [ [ [ - 1.7109375 + 24.5 ] ] ], @@ -18424,7 +18424,7 @@ ], [ [ - 3.59375 + 2.984375 ] ] ] @@ -18433,7 +18433,7 @@ "inputs": [ [ [ - 0.9921875 + 1.0 ] ], [ @@ -18447,14 +18447,14 @@ "inputs": [ [ [ - 2016.0 + 1584.0 ] ] ], "outputs": [ [ [ - 0.9921875 + 1.0 ] ] ] @@ -18463,7 +18463,7 @@ "inputs": [ [ [ - 4.90625 + 4.40625 ] ] ], @@ -18479,7 +18479,7 @@ "inputs": [ [ [ - 56.75 + 32.25 ] ] ], @@ -18495,7 +18495,7 @@ "inputs": [ [ [ - 4.96875 + 4.875 ] ] ], @@ -18511,7 
+18511,7 @@ "inputs": [ [ [ - 4.96875 + 4.875 ] ] ], @@ -18527,7 +18527,7 @@ "inputs": [ [ [ - 4.96875 + 4.875 ] ] ], @@ -18543,7 +18543,7 @@ "inputs": [ [ [ - 4.90625 + 8.3125 ] ] ], @@ -18559,12 +18559,12 @@ "inputs": [ [ [ - 7.15625 + 6.40625 ] ], [ [ - 7.71875 + 7.0 ] ] ] @@ -18578,7 +18578,7 @@ ], [ [ - 8.25 + 8.4375 ] ] ] @@ -18587,7 +18587,7 @@ "inputs": [ [ [ - 153.0 + 117.5 ] ] ], @@ -18603,7 +18603,7 @@ "inputs": [ [ [ - 5.0625 + 4.6875 ] ] ], @@ -18651,7 +18651,7 @@ "inputs": [ [ [ - 1.0078125 + 21.0 ] ] ], @@ -18672,7 +18672,7 @@ ], [ [ - 3.671875 + 3.046875 ] ] ] @@ -18681,7 +18681,7 @@ "inputs": [ [ [ - 0.90234375 + 1.0 ] ], [ @@ -18695,14 +18695,14 @@ "inputs": [ [ [ - 2224.0 + 1848.0 ] ] ], "outputs": [ [ [ - 0.90234375 + 1.0 ] ] ] @@ -18711,7 +18711,7 @@ "inputs": [ [ [ - 4.40625 + 4.5 ] ] ], @@ -18727,7 +18727,7 @@ "inputs": [ [ [ - 51.75 + 50.5 ] ] ], @@ -18743,7 +18743,7 @@ "inputs": [ [ [ - 26.0 + 33.5 ] ] ], @@ -18759,7 +18759,7 @@ "inputs": [ [ [ - 9.75 + 5.5 ] ] ], @@ -18791,7 +18791,7 @@ "inputs": [ [ [ - 11.125 + 8.3125 ] ] ], @@ -18807,7 +18807,7 @@ "inputs": [ [ [ - 8.5625 + 6.53125 ] ] ], @@ -18839,7 +18839,7 @@ "inputs": [ [ [ - 13.3125 + 10.875 ] ] ], @@ -18855,7 +18855,7 @@ "inputs": [ [ [ - 11.75 + 8.25 ] ] ], diff --git a/examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.npz b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.npz similarity index 68% rename from examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.npz rename to examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.npz index 2e6ad5c196..ad009dc768 100644 Binary files a/examples/stable-diffusion/quantization/measure/fp8_hooks_maxabs.npz and b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure/fp8_hooks_maxabs.npz differ diff --git a/examples/stable-diffusion/quantization/measure_config.json b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure_config.json old mode 100755 new mode 100644 similarity index 52% rename from examples/stable-diffusion/quantization/measure_config.json rename to examples/stable-diffusion/quantization/stable-diffusion-xl/measure_config.json index 04576eeb46..5a250cad7c --- a/examples/stable-diffusion/quantization/measure_config.json +++ b/examples/stable-diffusion/quantization/stable-diffusion-xl/measure_config.json @@ -2,5 +2,5 @@ "method": "HOOKS", "mode": "MEASURE", "observer": "maxabs", - "dump_stats_path": "./quantization/measure/fp8" + "dump_stats_path": "quantization/stable-diffusion-xl/measure/fp8" } diff --git a/examples/stable-diffusion/quantization/quant_config.json b/examples/stable-diffusion/quantization/stable-diffusion-xl/quantize_config.json old mode 100755 new mode 100644 similarity index 60% rename from examples/stable-diffusion/quantization/quant_config.json rename to examples/stable-diffusion/quantization/stable-diffusion-xl/quantize_config.json index b372905d7f..5d686e659d --- a/examples/stable-diffusion/quantization/quant_config.json +++ b/examples/stable-diffusion/quantization/stable-diffusion-xl/quantize_config.json @@ -3,5 +3,5 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "maxabs_hw", - "dump_stats_path": "./quantization/measure/fp8" -} \ No newline at end of file + "dump_stats_path": "quantization/stable-diffusion-xl/measure/fp8" +} diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index 8fd48c99a8..8eea534e01 100755 --- 
a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -42,7 +42,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logger = logging.getLogger(__name__) @@ -305,6 +305,12 @@ def main(): default=None, help="The file with prompts (for large number of images generation).", ) + parser.add_argument( + "--lora_scale", + type=float, + default=None, + help="A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.", + ) args = parser.parse_args() if args.optimize and not args.use_habana: @@ -380,6 +386,9 @@ def main(): if args.throughput_warmup_steps is not None: kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps + if args.lora_scale is not None: + kwargs_call["lora_scale"] = args.lora_scale + negative_prompts = args.negative_prompts if args.distributed: distributed_state = PartialState() @@ -440,11 +449,8 @@ def main(): kwargs_call["quant_mode"] = args.quant_mode - if args.quant_mode != "disable": - # Import htcore here to support model quantization - import habana_frameworks.torch.core as htcore # noqa: F401 - # Instantiate a Stable Diffusion pipeline class + quant_config_path = os.getenv("QUANT_CONFIG") if sdxl: # SDXL pipelines if controlnet: @@ -475,7 +481,6 @@ def main(): pipeline.unet.set_default_attn_processor(pipeline.unet) pipeline.to(torch.device("hpu")) - quant_config_path = os.getenv("QUANT_CONFIG") if quant_config_path: import habana_frameworks.torch.core as htcore from neural_compressor.torch.quantization import FP8Config, convert, prepare @@ -503,9 +508,6 @@ def main(): **kwargs, ) - if args.lora_id: - pipeline.load_lora_weights(args.lora_id) - elif sd3: # SD3 pipelines if controlnet: @@ -524,6 +526,7 @@ def main(): args.model_name_or_path, **kwargs, ) + elif flux: # Flux pipelines if controlnet: @@ -554,8 +557,6 @@ def main(): controlnet=controlnet, **kwargs, ) - if args.lora_id: - pipeline.load_lora_weights(args.lora_id) elif inpainting: # SD Inpainting pipeline @@ -599,6 +600,10 @@ def main(): **kwargs, ) + # Load LoRA weights if provided + if args.lora_id: + pipeline.load_lora_weights(args.lora_id) + # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -687,12 +692,12 @@ def main(): logger.info(f"Saving images in {image_save_dir.resolve()}...") if args.ldm3d: for i, rgb in enumerate(outputs.rgb): - rgb.save(image_save_dir / f"rgb_{i+1}.png") + rgb.save(image_save_dir / f"rgb_{i + 1}.png") for i, depth in enumerate(outputs.depth): - depth.save(image_save_dir / f"depth_{i+1}.png") + depth.save(image_save_dir / f"depth_{i + 1}.png") else: for i, image in enumerate(outputs.images): - image.save(image_save_dir / f"image_{i+1}.png") + image.save(image_save_dir / f"image_{i + 1}.png") else: logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") diff --git a/examples/text-to-video/text_to_video_generation.py b/examples/stable-diffusion/text_to_video_generation.py similarity index 99% rename from examples/text-to-video/text_to_video_generation.py rename to examples/stable-diffusion/text_to_video_generation.py index 8813e321cf..bde142a2c0 100755 --- a/examples/text-to-video/text_to_video_generation.py +++ b/examples/stable-diffusion/text_to_video_generation.py @@ -37,7 +37,7 @@ def 
check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/training/README.md b/examples/stable-diffusion/training/README.md index afa4a0a61f..4ea85c9e36 100644 --- a/examples/stable-diffusion/training/README.md +++ b/examples/stable-diffusion/training/README.md @@ -18,61 +18,71 @@ limitations under the License. This directory contains scripts that showcase how to perform training/fine-tuning of Stable Diffusion models on Habana Gaudi. - ## Textual Inversion [Textual Inversion](https://arxiv.org/abs/2208.01618) is a method to personalize text2image models like Stable Diffusion on your own images using just 3-5 examples. -The `textual_inversion.py` script shows how to implement the training procedure on Habana Gaudi. - -### Cat Toy Example +The `textual_inversion.py` script shows how to implement the training procedure on Habana Gaudi. In the examples below, we will use a set of cat images from the following dataset: [https://huggingface.co/datasets/diffusers/cat_toy_example](https://huggingface.co/datasets/diffusers/cat_toy_example) -Let's first download this dataset locally: - -```python -from huggingface_hub import snapshot_download -from pathlib import Path -import shutil - -local_dir = './cat' -snapshot_download( - 'diffusers/cat_toy_example', - local_dir=local_dir, - repo_type='dataset', - ignore_patterns='.gitattributes', -) -cache_dir = Path(local_dir, '.cache') -if cache_dir.is_dir(): - shutil.rmtree(cache_dir) +To download this and other example training datasets locally, run: +```bash +python download_train_datasets.py ``` -This will be our training data. Now we can launch the training using: ```bash python textual_inversion.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - --train_data_dir ./cat \ - --learnable_property object \ - --placeholder_token "" \ - --initializer_token toy \ - --resolution 512 \ - --train_batch_size 4 \ - --max_train_steps 3000 \ - --learning_rate 5.0e-04 \ - --scale_lr \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir /tmp/textual_inversion_cat \ - --save_as_full_pipeline \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ + --train_data_dir ./cat \ + --learnable_property object \ + --placeholder_token "" \ + --initializer_token toy \ + --resolution 512 \ + --train_batch_size 4 \ + --max_train_steps 3000 \ + --learning_rate 5.0e-04 \ + --scale_lr \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir /tmp/textual_inversion_cat \ + --save_as_full_pipeline \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 ``` -The following example shows how to run inference using the fine-tuned model: +> [!NOTE] +> Change `--resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. + +> [!NOTE] +> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. +> However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. +> This can help the model to learn more complex details. 
To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, +> *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. + +Once you have trained a model as described above, inference can be done using `GaudiStableDiffusionPipeline`. +Please make sure to include the `placeholder_token` in your prompt so that textual inversion guided inference can take effect. + +You can use `text_to_image_generation.py` sample to run inference with the fine-tuned model: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path /tmp/textual_inversion_cat \ + --prompts "A backpack" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/textual_inversion_cat_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference with the fine-tuned model using a simple Python script like this: ```python from optimum.habana.diffusers import GaudiStableDiffusionPipeline @@ -85,6 +95,7 @@ pipe = GaudiStableDiffusionPipeline.from_pretrained( use_habana=True, use_hpu_graphs=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) prompt = "A backpack" @@ -92,14 +103,6 @@ image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] image.save(f"cat-backpack.png") ``` -> Change `--resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. - -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. -> However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. -> This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, -> *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. 
- - ## Textual Inversion XL The `textual_inversion_sdxl.py` script shows how to implement textual inversion fine-tuning on Gaudi for XL diffusion models @@ -109,32 +112,52 @@ Assuming the afforemenioned cat toy dataset has been obtained, we can launch tex ```bash python textual_inversion_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --train_data_dir ./cat \ - --learnable_property object \ - --placeholder_token "" \ - --initializer_token toy \ - --resolution 768 \ - --train_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --max_train_steps 500 \ - --learning_rate 5.0e-04 \ - --scale_lr \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir /tmp/textual_inversion_cat_sdxl \ - --save_as_full_pipeline \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --train_data_dir ./cat \ + --learnable_property object \ + --placeholder_token "" \ + --initializer_token toy \ + --resolution 768 \ + --train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --max_train_steps 500 \ + --learning_rate 5.0e-04 \ + --scale_lr \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir /tmp/textual_inversion_cat_sdxl \ + --save_as_full_pipeline \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 ``` -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, *e.g.* `""`. -> However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable parameters. -> This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to a number larger than one, -> *e.g.*: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. +> [!NOTE] +> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, +> e.g. `""`. However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable +> parameters. This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to +> a number larger than one, e.g.: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. The script also supports training of both text encoders of SDXL, so inference can be executed by inserting a placeholder token into one or both prompts. -The following example shows how to run inference using the fine tuned-model with both text encoders, separately and in combination: + +For example, after training you can use `text_to_image_generation.py` sample to run inference with the fine-tuned model as follows: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path /tmp/textual_inversion_cat_sdxl \ + --prompts "A backpack" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/textual_inversion_cat_sdxl_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference with the fine-tuned model using a simple standalone Python script. 
+The following script can be used to run inference using the fine-tuned model with both text encoders, +separately and in combination: ```python from optimum.habana.diffusers import GaudiStableDiffusionXLPipeline @@ -147,6 +170,7 @@ pipe = GaudiStableDiffusionXLPipeline.from_pretrained( use_habana=True, use_hpu_graphs=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) prompt = "A backpack" @@ -161,73 +185,77 @@ image = pipe(prompt=prompt, prompt_2=prompt_2, num_inference_steps=50, guidance_ image.save(f"cat-backpack_p1and2.png") ``` -> [!NOTE] -> Change `--resolution` to 768 if you are using [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model. - -> [!NOTE] -> As described in [the official paper](https://arxiv.org/abs/2208.01618), only one embedding vector is used for the placeholder token, -> e.g. `""`. However, one can also add multiple embedding vectors for the placeholder token to increase the number of fine-tuneable -> parameters. This can help the model to learn more complex details. To use multiple embedding vectors, you can define `--num_vectors` to -> a number larger than one, e.g.: `--num_vectors 5`. The saved textual inversion vectors will then be larger in size compared to the default case. - - ## ControlNet Training ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models ](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. It is a type of model for controlling StableDiffusion by conditioning the model with an additional input image. This example is adapted from [controlnet example in the diffusers repository](https://github.com/huggingface/diffusers/tree/main/examples/controlnet#training). -First, download the conditioning images as shown below: - +To download the example conditioning images locally, run: ```bash -wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png -wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_2.png +python download_train_datasets.py ``` Then proceed to training with command: ```bash python train_controlnet.py \ - --pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4\ - --output_dir=/tmp/stable_diffusion1_5 \ - --dataset_name=fusing/fill50k \ - --resolution=512 \ - --learning_rate=1e-5 \ - --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 \ - --throughput_warmup_steps=3 \ - --use_hpu_graphs \ - --sdp_on_bf16 \ - --bf16 \ - --trust_remote_code + --pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4\ + --output_dir=/tmp/stable_diffusion1_4 \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./cnet/conditioning_image_1.png" "./cnet/conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --throughput_warmup_steps=3 \ + --use_hpu_graphs \ + --sdp_on_bf16 \ + --bf16 \ + --trust_remote_code ``` -### Multi-card Run +### Multi-Card Training You can run these fine-tuning scripts in a distributed fashion as follows: ```bash python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \ - --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ - 
--output_dir=/tmp/stable_diffusion1_5 \ - --dataset_name=fusing/fill50k \ - --resolution=512 \ - --learning_rate=1e-5 \ - --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ - --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs \ - --sdp_on_bf16 \ - --bf16 \ - --trust_remote_code + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ + --output_dir=/tmp/stable_diffusion1_4 \ + --dataset_name=fusing/fill50k \ + --resolution=512 \ + --learning_rate=1e-5 \ + --validation_image "./cnet/conditioning_image_1.png" "./cnet/conditioning_image_2.png" \ + --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ + --train_batch_size=4 \ + --throughput_warmup_steps 3 \ + --use_hpu_graphs \ + --sdp_on_bf16 \ + --bf16 \ + --trust_remote_code ``` - ### Inference -Once you have trained a model as described right above, inference can be done simply using the `GaudiStableDiffusionPipeline`. -Make sure to include the `placeholder_token` in your prompt. +After training completes, you can use `text_to_image_generation.py` sample to run inference with the fine-tuned ControlNet model: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ + --controlnet_model_name_or_path /tmp/stable_diffusion1_4 \ + --prompts "pale golden rod circle with old lace background" \ + --control_image "./cnet/conditioning_image_1.png" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/controlnet_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference using a simple standalone Python script, as shown below: ```python from diffusers import ControlNetModel, UniPCMultistepScheduler @@ -236,7 +264,7 @@ import torch from optimum.habana.diffusers import GaudiStableDiffusionControlNetPipeline base_model_path = "CompVis/stable-diffusion-v1-4" -controlnet_path = "/tmp/stable_diffusion1_5" +controlnet_path = "/tmp/stable_diffusion1_4" controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.bfloat16) pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( @@ -246,12 +274,13 @@ pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( use_habana=True, use_hpu_graphs=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) # speed up diffusion process with faster scheduler and memory optimization pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) -control_image = load_image("./conditioning_image_1.png") +control_image = load_image("./cnet/conditioning_image_1.png") prompt = "pale golden rod circle with old lace background" # generate image @@ -262,7 +291,6 @@ image = pipe( image.save("./output.png") ``` - ## Fine-Tuning for Stable Diffusion XL The `train_text_to_image_sdxl.py` script shows how to implement the fine-tuning of Stable Diffusion XL models on Gaudi. 
@@ -274,103 +302,102 @@ Install the requirements: pip install -r requirements.txt ``` -### Single-card Training +### Single Card Training To train Stable Diffusion XL on a single Gaudi card, use: ```bash python train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ - --dataset_name lambdalabs/naruto-blip-captions \ - --resolution 512 \ - --crop_resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 16 \ - --max_train_steps 2500 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl_model_output \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 8 \ - --sdp_on_bf16 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --validation_prompt="a cute naruto creature" \ - --validation_epochs 48 \ - --checkpointing_steps 2500 \ - --logging_step 10 \ - --adjust_throughput + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ + --dataset_name lambdalabs/naruto-blip-captions \ + --resolution 512 \ + --crop_resolution 512 \ + --center_crop \ + --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size 16 \ + --max_train_steps 2500 \ + --learning_rate 1e-05 \ + --max_grad_norm 1 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir sdxl_model_output \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 \ + --dataloader_num_workers 8 \ + --sdp_on_bf16 \ + --bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --validation_prompt="a cute naruto creature" \ + --validation_epochs 48 \ + --checkpointing_steps 2500 \ + --logging_step 10 \ + --adjust_throughput ``` - ### Multi-Card Training To train Stable Diffusion XL on a multi-card Gaudi system, use: ```bash PT_HPU_RECIPE_CACHE_CONFIG=/tmp/stdxl_recipe_cache,True,1024 \ python ../../gaudi_spawn.py --world_size 8 --use_mpi train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ - --dataset_name lambdalabs/naruto-blip-captions \ - --resolution 512 \ - --crop_resolution 512 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 16 \ - --max_train_steps 336 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl_model_output \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --dataloader_num_workers 8 \ - --sdp_on_bf16 \ - --bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --validation_prompt="a cute naruto creature" \ - --validation_epochs 48 \ - --checkpointing_steps 336 \ - --mediapipe dataset_sdxl_mediapipe \ - --adjust_throughput + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ + --dataset_name lambdalabs/naruto-blip-captions \ + --resolution 512 \ + --crop_resolution 512 \ + --center_crop \ + --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size 16 \ + --max_train_steps 336 \ + --learning_rate 1e-05 \ + --max_grad_norm 1 \ + --lr_scheduler constant \ + 
--lr_warmup_steps 0 \ + --output_dir sdxl_model_output \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 \ + --dataloader_num_workers 8 \ + --sdp_on_bf16 \ + --bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --validation_prompt="a cute naruto creature" \ + --validation_epochs 48 \ + --checkpointing_steps 336 \ + --mediapipe dataset_sdxl_mediapipe \ + --adjust_throughput ``` -### Single-Card Training on Gaudi1 +### Single Card Training on Gaudi1 To train Stable Diffusion XL on a single Gaudi1 card, use: ```bash python train_text_to_image_sdxl.py \ - --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ - --dataset_name lambdalabs/naruto-blip-captions \ - --resolution 256 \ - --center_crop \ - --random_flip \ - --proportion_empty_prompts=0.2 \ - --train_batch_size 1 \ - --gradient_accumulation_steps 4 \ - --max_train_steps 3000 \ - --learning_rate 1e-05 \ - --max_grad_norm 1 \ - --lr_scheduler constant \ - --lr_warmup_steps 0 \ - --output_dir sdxl_model_output \ - --gaudi_config_name Habana/stable-diffusion \ - --throughput_warmup_steps 3 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --checkpointing_steps 3000 \ - --sdp_on_bf16 \ - --bf16 + --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --pretrained_vae_model_name_or_path madebyollin/sdxl-vae-fp16-fix \ + --dataset_name lambdalabs/naruto-blip-captions \ + --resolution 256 \ + --center_crop \ + --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --max_train_steps 3000 \ + --learning_rate 1e-05 \ + --max_grad_norm 1 \ + --lr_scheduler constant \ + --lr_warmup_steps 0 \ + --output_dir sdxl_model_output \ + --gaudi_config_name Habana/stable-diffusion \ + --throughput_warmup_steps 3 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --checkpointing_steps 3000 \ + --sdp_on_bf16 \ + --bf16 ``` > [!NOTE] @@ -380,6 +407,24 @@ python train_text_to_image_sdxl.py \ > [!NOTE] > `--mediapipe` only works on Gaudi2. +### Inference + +After training is finished, you can run inference using `text_to_image_generation.py` script as follows: + +```bash +python ../text_to_image_generation.py \ + --model_name_or_path sdxl_model_output \ + --prompts "a cute naruto creature" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/stable_diffusion_xl_images \ + --scheduler euler_discrete \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` ## DreamBooth @@ -387,28 +432,12 @@ DreamBooth is a technique for personalizing text-to-image models like Stable Dif of a specific subject. The `train_dreambooth.py` script demonstrates how to implement this training process and adapt it for Stable Diffusion. -### Dog Toy Example - For DreamBooth examples we will use a set of dog images from the following dataset: [https://huggingface.co/datasets/diffusers/dog-example](https://huggingface.co/datasets/diffusers/dog-example). 
-Let's first download this dataset locally: - -```python -from huggingface_hub import snapshot_download -from pathlib import Path -import shutil - -local_dir = './dog' -snapshot_download( - 'diffusers/dog-example', - local_dir=local_dir, - repo_type='dataset', - ignore_patterns='.gitattributes', -) -cache_dir = Path(local_dir, '.cache') -if cache_dir.is_dir(): - shutil.rmtree(cache_dir) +To download this and other example training datasets locally, run: +```bash +python download_train_datasets.py ``` ### Full Model Fine-Tuning @@ -416,26 +445,26 @@ if cache_dir.is_dir(): To launch the multi-card Stable Diffusion training, use: ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth.py \ - --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ - --instance_data_dir="dog" \ - --output_dir="dog_sd" \ - --class_data_dir="path-to-class-images" \ - --with_prior_preservation --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --num_class_images=200 \ - --gradient_accumulation_steps=1 \ - --learning_rate=5e-6 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --mixed_precision=bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/stable-diffusion \ - full + --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ + --instance_data_dir="dog" \ + --output_dir="dog_sd" \ + --class_data_dir="path-to-class-images" \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --num_class_images=200 \ + --gradient_accumulation_steps=1 \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=800 \ + --mixed_precision=bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --gaudi_config_name Habana/stable-diffusion \ + full ``` Prior preservation is used to prevent overfitting and language drift. For more details, refer to the original paper. @@ -453,27 +482,27 @@ UNet or text encoder. 
To run the multi-card training, use: ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth.py \ - --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ - --instance_data_dir="dog" \ - --output_dir="dog_sd" \ - --class_data_dir="path-to-class-images" \ - --with_prior_preservation \ - --prior_loss_weight=1.0 \ - --instance_prompt="a photo of sks dog" \ - --class_prompt="a photo of dog" \ - --resolution=512 \ - --train_batch_size=1 \ - --num_class_images=200 \ - --gradient_accumulation_steps=1 \ - --learning_rate=1e-4 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=800 \ - --mixed_precision=bf16 \ - --use_hpu_graphs_for_training \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/stable-diffusion \ - lora --unet_r 8 --unet_alpha 8 + --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \ + --instance_data_dir="dog" \ + --output_dir="dog_sd" \ + --class_data_dir="path-to-class-images" \ + --with_prior_preservation \ + --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --num_class_images=200 \ + --gradient_accumulation_steps=1 \ + --learning_rate=1e-4 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=800 \ + --mixed_precision=bf16 \ + --use_hpu_graphs_for_training \ + --use_hpu_graphs_for_inference \ + --gaudi_config_name Habana/stable-diffusion \ + lora --unet_r 8 --unet_alpha 8 ``` > [!NOTE] > When using PEFT method we can use a much higher learning rate compared to vanilla dreambooth. @@ -514,54 +543,70 @@ We can use the same `dog` dataset for the following examples. To launch Stable Diffusion XL LoRA training on a multi-card Gaudi system, use:" ```bash python train_dreambooth_lora_sdxl.py \ - --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \ - --instance_data_dir="dog" \ - --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \ - --output_dir="lora-trained-xl" \ - --mixed_precision="bf16" \ - --instance_prompt="a photo of sks dog" \ - --resolution=1024 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --learning_rate=1e-4 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=500 \ - --validation_prompt="A photo of sks dog in a bucket" \ - --validation_epochs=25 \ - --seed=0 \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - --gaudi_config_name Habana/stable-diffusion + --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \ + --instance_data_dir="dog" \ + --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \ + --output_dir="lora-trained-xl" \ + --mixed_precision="bf16" \ + --instance_prompt="a photo of sks dog" \ + --resolution=1024 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --learning_rate=1e-4 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --validation_prompt="A photo of sks dog in a bucket" \ + --validation_epochs=25 \ + --seed=0 \ + --use_hpu_graphs_for_inference \ + --use_hpu_graphs_for_training \ + --gaudi_config_name Habana/stable-diffusion ``` To launch Stable Diffusion XL LoRA training on a multi-card Gaudi system, use:" ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth_lora_sdxl.py \ - --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \ - --instance_data_dir="dog" \ - 
--pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \ - --output_dir="lora-trained-xl" \ - --mixed_precision="bf16" \ - --instance_prompt="a photo of sks dog" \ - --resolution=1024 \ - --train_batch_size=1 \ - --gradient_accumulation_steps=4 \ - --learning_rate=1e-4 \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --max_train_steps=500 \ - --validation_prompt="A photo of sks dog in a bucket" \ - --validation_epochs=25 \ - --seed=0 \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - --gaudi_config_name Habana/stable-diffusion + --pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" \ + --instance_data_dir="dog" \ + --pretrained_vae_model_name_or_path="madebyollin/sdxl-vae-fp16-fix" \ + --output_dir="lora-trained-xl" \ + --mixed_precision="bf16" \ + --instance_prompt="a photo of sks dog" \ + --resolution=1024 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --learning_rate=1e-4 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --validation_prompt="A photo of sks dog in a bucket" \ + --validation_epochs=25 \ + --seed=0 \ + --use_hpu_graphs_for_inference \ + --use_hpu_graphs_for_training \ + --gaudi_config_name Habana/stable-diffusion ``` > [!NOTE] > To use DeepSpeed instead of MPI, replace `--use_mpi` with `--deepspeed` in the previous example -After training completes, you can run inference with a simple python script like this: +After training is completed, you can directly use `text_to_image_generation.py` sample for inference, as shown below: +```bash +python ../text_to_image_generation.py \ + --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ + --lora_id lora-trained-xl \ + --prompts "A picture of a sks dog in a bucket" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/stable_diffusion_xl_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference with a simple Python script such as this: ```python import torch from optimum.habana import GaudiConfig @@ -573,6 +618,7 @@ pipe = GaudiStableDiffusionXLPipeline.from_pretrained( use_hpu_graphs=True, use_habana=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) pipe.load_lora_weights("lora-trained-xl") @@ -588,21 +634,6 @@ image = pipe( image.save("sdxl-lora.png") ``` -Alternatively, you could directly use `text_to_image_generation.py` sample for inference as follows: -```bash -python ../text_to_image_generation.py \ - --model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \ - --lora_id lora-trained-xl \ - --prompts "A picture of a sks dog in a bucket" \ - --num_images_per_prompt 5 \ - --batch_size 1 \ - --image_save_dir /tmp/stable_diffusion_xl_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --bf16 -``` - ### DreamBooth LoRA Fine-Tuning with FLUX.1-dev We can use the same `dog` dataset for the following examples. @@ -610,60 +641,76 @@ We can use the same `dog` dataset for the following examples. 
To launch FLUX.1-dev LoRA training on a single Gaudi card, use:" ```bash python train_dreambooth_lora_flux.py \ - --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ - --dataset="dog" \ - --prompt="a photo of sks dog" \ - --output_dir="dog_lora_flux" \ - --mixed_precision="bf16" \ - --weighting_scheme="none" \ - --resolution=1024 \ - --train_batch_size=1 \ - --learning_rate=1e-4 \ - --guidance_scale=1 \ - --report_to="tensorboard" \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --cache_latents \ - --rank=4 \ - --max_train_steps=500 \ - --seed="0" \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - --gaudi_config_name="Habana/stable-diffusion" + --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ + --dataset="dog" \ + --prompt="a photo of sks dog" \ + --output_dir="dog_lora_flux" \ + --mixed_precision="bf16" \ + --weighting_scheme="none" \ + --resolution=1024 \ + --train_batch_size=1 \ + --learning_rate=1e-4 \ + --guidance_scale=1 \ + --report_to="tensorboard" \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --cache_latents \ + --rank=4 \ + --max_train_steps=500 \ + --seed="0" \ + --use_hpu_graphs_for_inference \ + --use_hpu_graphs_for_training \ + --gaudi_config_name="Habana/stable-diffusion" ``` To launch FLUX.1-dev LoRA training on a multi-card Gaudi system, use:" ```bash python ../../gaudi_spawn.py --world_size 8 --use_mpi train_dreambooth_lora_flux.py \ - --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ - --dataset="dog" \ - --prompt="a photo of sks dog" \ - --output_dir="dog_lora_flux" \ - --mixed_precision="bf16" \ - --weighting_scheme="none" \ - --resolution=1024 \ - --train_batch_size=1 \ - --learning_rate=1e-4 \ - --guidance_scale=1 \ - --report_to="tensorboard" \ - --gradient_accumulation_steps=4 \ - --gradient_checkpointing \ - --lr_scheduler="constant" \ - --lr_warmup_steps=0 \ - --cache_latents \ - --rank=4 \ - --max_train_steps=500 \ - --seed="0" \ - --use_hpu_graphs_for_inference \ - --use_hpu_graphs_for_training \ - --gaudi_config_name="Habana/stable-diffusion" + --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \ + --dataset="dog" \ + --prompt="a photo of sks dog" \ + --output_dir="dog_lora_flux" \ + --mixed_precision="bf16" \ + --weighting_scheme="none" \ + --resolution=1024 \ + --train_batch_size=1 \ + --learning_rate=1e-4 \ + --guidance_scale=1 \ + --report_to="tensorboard" \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --cache_latents \ + --rank=4 \ + --max_train_steps=500 \ + --seed="0" \ + --use_hpu_graphs_for_inference \ + --use_hpu_graphs_for_training \ + --gaudi_config_name="Habana/stable-diffusion" ``` > [!NOTE] > To use DeepSpeed instead of MPI, replace `--use_mpi` with `--use_deepspeed` in the previous example -After training completes, you can run inference on Gaudi system with a simple python script like this: +After training completes, you could directly use `text_to_image_generation.py` sample for inference as follows: +```bash +python ../text_to_image_generation.py \ + --model_name_or_path "black-forest-labs/FLUX.1-dev" \ + --lora_id dog_lora_flux \ + --prompts "A picture of a sks dog in a bucket" \ + --num_images_per_prompt 5 \ + --batch_size 1 \ + --image_save_dir /tmp/flux_images \ + --use_habana \ + --use_hpu_graphs \ + --gaudi_config 
Habana/stable-diffusion \ + --sdp_on_bf16 \ + --bf16 +``` + +Alternatively, you can run inference on Gaudi system with a simple Python script like this: ```python import torch from optimum.habana import GaudiConfig @@ -675,6 +722,7 @@ pipe = GaudiFluxPipeline.from_pretrained( use_hpu_graphs=True, use_habana=True, gaudi_config="Habana/stable-diffusion", + sdp_on_bf16=True, ) pipe.load_lora_weights("dog_lora_flux") @@ -688,19 +736,3 @@ image = pipe( ).images[0] image.save("flux-dev.png") ``` - -Alternatively, you could directly use `text_to_image_generation.py` sample for inference as follows: -```bash -python ../text_to_image_generation.py \ - --model_name_or_path "black-forest-labs/FLUX.1-dev" \ - --lora_id dog_lora_flux \ - --prompts "A picture of a sks dog in a bucket" \ - --num_images_per_prompt 5 \ - --batch_size 1 \ - --image_save_dir /tmp/flux_images \ - --use_habana \ - --use_hpu_graphs \ - --gaudi_config Habana/stable-diffusion \ - --sdp_on_bf16 \ - --bf16 -``` diff --git a/examples/stable-diffusion/training/download_train_datasets.py b/examples/stable-diffusion/training/download_train_datasets.py new file mode 100755 index 0000000000..6ff500c9ef --- /dev/null +++ b/examples/stable-diffusion/training/download_train_datasets.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +import shutil +from pathlib import Path + +from huggingface_hub import hf_hub_download, snapshot_download + + +# Download Cat-Toy example dataset +local_dir = "./cat" +snapshot_download( + repo_id="diffusers/cat_toy_example", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +cache_dir = Path(local_dir, ".cache") +if cache_dir.is_dir(): + shutil.rmtree(cache_dir) + +# Download Dog example dataset +local_dir = "./dog" +snapshot_download( + repo_id="diffusers/dog-example", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +cache_dir = Path(local_dir, ".cache") +if cache_dir.is_dir(): + shutil.rmtree(cache_dir) + +# Download ControlNet example images +local_dir = "./cnet" +file_path1 = hf_hub_download( + repo_id="huggingface/documentation-images", + subfolder="diffusers/controlnet_training", + filename="conditioning_image_1.png", + repo_type="dataset", + local_dir=local_dir, +) +file_path2 = hf_hub_download( + repo_id="huggingface/documentation-images", + subfolder="diffusers/controlnet_training", + filename="conditioning_image_2.png", + repo_type="dataset", + local_dir=local_dir, +) +shutil.move(file_path1, local_dir) +shutil.move(file_path2, local_dir) +cache_dir = Path(local_dir, ".cache") +if cache_dir.is_dir(): + shutil.rmtree(cache_dir) +sub_dir = Path(local_dir, "diffusers") +if sub_dir.is_dir(): + shutil.rmtree(sub_dir) diff --git a/examples/stable-diffusion/training/requirements.txt b/examples/stable-diffusion/training/requirements.txt index bf92040ae8..558217e643 100644 --- a/examples/stable-diffusion/training/requirements.txt +++ b/examples/stable-diffusion/training/requirements.txt @@ -1,3 +1,5 @@ imagesize peft == 0.10.0 sentencepiece +compel +datasets diff --git a/examples/stable-diffusion/training/textual_inversion.py b/examples/stable-diffusion/training/textual_inversion.py index db488f8749..2f465699b3 100755 --- a/examples/stable-diffusion/training/textual_inversion.py +++ b/examples/stable-diffusion/training/textual_inversion.py @@ -130,6 +130,7 @@ def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight use_habana=True, use_hpu_graphs=True, gaudi_config=args.gaudi_config_name, + sdp_on_bf16=args.sdp_on_bf16, ) 
pipeline.scheduler = GaudiDDIMScheduler.from_config(pipeline.scheduler.config) pipeline.set_progress_bar_config(disable=True) @@ -415,6 +416,9 @@ def parse_args(): default=None, help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) parser.add_argument( "--throughput_warmup_steps", type=int, @@ -883,7 +887,7 @@ def main(): htcore.mark_step() # Let's make sure we don't update any embedding weights besides the newly added token - index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool) + index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool, device=accelerator.device) index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False with torch.no_grad(): diff --git a/examples/stable-diffusion/training/textual_inversion_sdxl.py b/examples/stable-diffusion/training/textual_inversion_sdxl.py old mode 100644 new mode 100755 index 608ee481ad..3ab6c57602 --- a/examples/stable-diffusion/training/textual_inversion_sdxl.py +++ b/examples/stable-diffusion/training/textual_inversion_sdxl.py @@ -392,6 +392,9 @@ def parse_args(): default=None, help="Local path to the Gaudi configuration file or its name on the Hugging Face Hub.", ) + parser.add_argument( + "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in the SDPA math backend" + ) parser.add_argument( "--throughput_warmup_steps", type=int, @@ -623,6 +626,7 @@ def main(): use_habana=True, use_hpu_graphs=True, gaudi_config=args.gaudi_config_name, + sdp_on_bf16=args.sdp_on_bf16, ) text_encoder_1 = pipeline.text_encoder.to(accelerator.device) text_encoder_2 = pipeline.text_encoder_2.to(accelerator.device) @@ -918,9 +922,9 @@ def main(): htcore.mark_step() # Let's make sure we don't update any embedding weights besides the newly added token - index_no_updates = torch.ones((len(tokenizer_1),), dtype=torch.bool) + index_no_updates = torch.ones((len(tokenizer_1),), dtype=torch.bool, device=accelerator.device) index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False - index_no_updates_2 = torch.ones((len(tokenizer_2),), dtype=torch.bool) + index_no_updates_2 = torch.ones((len(tokenizer_2),), dtype=torch.bool, device=accelerator.device) index_no_updates_2[min(placeholder_token_ids_2) : max(placeholder_token_ids_2) + 1] = False with torch.no_grad(): diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py index 004cee5af5..d7e8e33367 100755 --- a/examples/stable-diffusion/training/train_controlnet.py +++ b/examples/stable-diffusion/training/train_controlnet.py @@ -68,7 +68,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") if is_wandb_available(): import wandb diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_flux.py b/examples/stable-diffusion/training/train_dreambooth_lora_flux.py index 68b5320d19..1117d0a43f 100755 --- a/examples/stable-diffusion/training/train_dreambooth_lora_flux.py +++ b/examples/stable-diffusion/training/train_dreambooth_lora_flux.py @@ -784,7 +784,7 @@ def load_model_hook(models, input_dir): lora_state_dict = FluxPipeline.lora_state_dict(input_dir) transformer_state_dict = { - f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.") + f"{k.replace('transformer.', '')}": v for k, v in lora_state_dict.items() if k.startswith("transformer.") } transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict) incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default") diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py index b177cf12e6..4e96ee8e0d 100755 --- a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py +++ b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py @@ -94,7 +94,7 @@ def save_model_card( for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) img_str += f""" - - text: '{validation_prompt if validation_prompt else ' ' }' + - text: '{validation_prompt if validation_prompt else " "}' output: url: "image_{i}.png" @@ -1083,7 +1083,7 @@ def load_model_hook(models, input_dir): lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir) - unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} + unet_state_dict = {f"{k.replace('unet.', '')}": v for k, v in lora_state_dict.items() if k.startswith("unet.")} unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") if incompatible_keys is not None: diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index b78c84bbe1..7bb96e51a1 100755 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -884,9 +884,9 @@ def main(args): # download the dataset. 
if args.dataset_name is not None: if len(args.mediapipe) > 0: - assert ( - args.resolution == args.crop_resolution - ), f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})" + assert args.resolution == args.crop_resolution, ( + f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})" + ) if args.local_rank == 0: if not os.path.exists(args.mediapipe): os.mkdir(args.mediapipe) @@ -1532,7 +1532,7 @@ def compute_time_ids(original_size, crops_coords_top_left): image_save_dir.mkdir(parents=True, exist_ok=True) logger.info(f"Saving images in {image_save_dir.resolve()}...") for i, image in enumerate(images): - image.save(image_save_dir / f"image_{epoch}_{i+1}.png") + image.save(image_save_dir / f"image_{epoch}_{i + 1}.png") else: logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") diff --git a/examples/stable-diffusion/unconditional_image_generation.py b/examples/stable-diffusion/unconditional_image_generation.py index bd70d0e4d6..ee49695910 100755 --- a/examples/stable-diffusion/unconditional_image_generation.py +++ b/examples/stable-diffusion/unconditional_image_generation.py @@ -20,7 +20,7 @@ def check_optimum_habana_min_version(*a, **b): check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") # Setup logging logging.basicConfig( diff --git a/examples/summarization/README.md b/examples/summarization/README.md index 745b293d69..bdaef78edf 100644 --- a/examples/summarization/README.md +++ b/examples/summarization/README.md @@ -179,65 +179,8 @@ python ../gaudi_spawn.py \ ## Using DeepSpeed -Here is an example on 8 HPUs on Gaudi2/Gaudi3 with DeepSpeed-ZeRO3 to fine-tune [FLAN-T5 XXL](https://huggingface.co/google/flan-t5-xxl): -```bash -PT_HPU_MAX_COMPOUND_OP_SIZE=512 python ../gaudi_spawn.py \ - --world_size 8 --use_deepspeed run_summarization.py \ - --model_name_or_path google/flan-t5-xxl \ - --do_train \ - --do_eval \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --source_prefix '"summarize: "' \ - --output_dir ./tst-summarization \ - --per_device_train_batch_size 22 \ - --per_device_eval_batch_size 22 \ - --learning_rate 1e-4 \ - --num_train_epochs 3 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --generation_max_length 129 \ - --save_strategy epoch \ - --throughput_warmup_steps 3 \ - --gradient_checkpointing \ - --adam_epsilon 1e-08 --logging_steps 1 \ - --deepspeed ds_flan_t5_z3_config_bf16.json -``` - -Here is an example on 8 HPUs on Gaudi2 with DeepSpeed-ZeRO2 to fine-tune t5-large: -```bash -PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py \ - --world_size 8 \ - --use_deepspeed run_summarization.py \ - --deepspeed ../../tests/configs/deepspeed_zero_2.json \ - --do_train \ - --do_eval \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --gaudi_config_name Habana/t5 \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --save_strategy no \ - --throughput_warmup_steps 15 \ - --model_name_or_path t5-large \ - --source_prefix '"summarize:"' \ - --dataset_name cnn_dailymail \ - --dataset_config '"3.0.0"' \ - --output_dir /tmp/tst-summarization \ - --per_device_train_batch_size 20 \ - --per_device_eval_batch_size 20 \ - --max_train_samples 2000 \ - --torch_compile_backend hpu_backend 
\ - --torch_compile -``` - -You can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. +You can check the [DeepSpeed](https://github.com/huggingface/optimum-habana/tree/main/examples#deepspeed) section in Optimum Habana examples for how to run DeepSpeed. +You also can look at the [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/deepspeed) for more information about how to use DeepSpeed in Optimum Habana. ## Inference @@ -262,26 +205,8 @@ python run_summarization.py \ --gaudi_config_name Habana/t5 \ --ignore_pad_token_for_loss False \ --pad_to_max_length \ + --throughput_warmup_steps 3 \ --bf16 \ --bf16_full_eval ``` -You can run inference with BART on the CNN-DailyMail dataset on 1 Gaudi card with the following command: -```bash -python run_summarization.py \ - --model_name_or_path facebook/bart-large-cnn \ - --do_predict \ - --dataset_name cnn_dailymail \ - --dataset_config "3.0.0" \ - --output_dir /tmp/tst-summarization \ - --per_device_eval_batch_size 2 \ - --overwrite_output_dir \ - --predict_with_generate \ - --use_habana \ - --use_lazy_mode \ - --use_hpu_graphs_for_inference \ - --gaudi_config_name Habana/bart \ - --ignore_pad_token_for_loss False \ - --pad_to_max_length \ - --num_beams 1 -``` diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 65755d24a2..4accefa95d 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -66,7 +66,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") @@ -559,9 +559,9 @@ def main(): return if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): - assert ( - data_args.lang is not None - ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + assert data_args.lang is not None, ( + f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument" + ) tokenizer.src_lang = data_args.lang tokenizer.tgt_lang = data_args.lang diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 68f5e9a2aa..e4388b1fde 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -58,7 +58,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") @@ -168,9 +168,9 @@ def __post_init__(self): train_extension = self.train_file.split(".")[-1] assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." validation_extension = self.validation_file.split(".")[-1] - assert ( - validation_extension == train_extension - ), "`validation_file` should have the same extension (csv or json) as `train_file`." 
+ assert validation_extension == train_extension, ( + "`validation_file` should have the same extension (csv or json) as `train_file`." + ) @dataclass @@ -338,9 +338,9 @@ def main(): if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] - assert ( - test_extension == train_extension - ), "`test_file` should have the same extension (csv or json) as `train_file`." + assert test_extension == train_extension, ( + "`test_file` should have the same extension (csv or json) as `train_file`." + ) data_files["test"] = data_args.test_file else: raise ValueError("Need either a GLUE task or a test file for `do_predict`.") diff --git a/examples/text-feature-extraction/README.md b/examples/text-feature-extraction/README.md index 2b0d5354ef..e46168840b 100644 --- a/examples/text-feature-extraction/README.md +++ b/examples/text-feature-extraction/README.md @@ -31,10 +31,3 @@ python run_feature_extraction.py \ --sdp_on_bf16 \ --bf16 ``` - -Models that have been validated: - -- [Supabase/gte-small](https://huggingface.co/Supabase/gte-small) -- [thenlper/gte-small](https://huggingface.co/thenlper/gte-small) -- [thenlper/gte-base](https://huggingface.co/thenlper/gte-base) -- [thenlper/gte-large](https://huggingface.co/thenlper/gte-large) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 7767443c6e..7fa3e5ca70 100755 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -33,7 +33,7 @@ pip install -r requirements_lm_eval.txt Then, if you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html) (e.g. to use BLOOM/BLOOMZ), you should install DeepSpeed as follows: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 ``` @@ -132,6 +132,7 @@ Here are a few settings you may be interested in: - `--prompt` to benchmark the model on one or several prompts of your choice - `--attn_softmax_bf16` to run attention softmax layer in bfloat16 precision provided that the model (such as Llama) supports it - `--trim_logits` to calculate logits only for the last token in the first time step provided that the model (such as Llama) supports it +- `--attn_batch_split` specifies the number of smaller batches into which attention and MLP processing are split to improve parallelization. By default, no splitting is performed (value is 1). Splitting is enabled only for prompt processing. This configuration is most effective for batch sizes (BS) > 125 and tensor parallelism (TP) >= 2, with a recommended value of '3' splits. For example, you can reproduce the results presented in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom) with the following command: ```bash @@ -189,6 +190,7 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ To run Llama3-405B inference on 8 Gaudi3 cards use the following command: ```bash +ENABLE_LB_BUNDLE_ALL_COMPUTE_MME=0 ENABLE_EXPERIMENTAL_FLAGS=1 \ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --model_name_or_path meta-llama/Llama-3.1-405B-Instruct \ --max_new_tokens 2048 \ @@ -201,6 +203,21 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --flash_attention_causal_mask ``` + +To run Deepseek-R1-BF16 inference on 16 Gaudi3 cards (2 nodes) use the following command. 
Ensure you replace the hostfile parameter with the appropriate file. Sample hostfile reference [here](https://github.com/huggingface/optimum-habana/blob/main/examples/multi-node-training/hostfile) +```bash +python3 ../gaudi_spawn.py --hostfile= --use_deepspeed \ +--world_size 16 ./run_generation.py \ +--model_name_or_path opensourcerelease/DeepSeek-R1-bf16 \ +--bf16 \ +--trim_logits \ +--batch_size 1 \ +--use_hpu_graphs \ +--use_kv_cache \ +--parallel_strategy "ep" \ +--prompt "DeepSpeed is a machine learning framework" +``` + > To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should: > - have a HF account > - agree to the terms of use of the model in its model card on the HF Hub @@ -236,7 +253,8 @@ python run_generation.py \ --dataset_name JulesBelveze/tldr_news \ --column_name content \ --bf16 \ ---sdp_on_bf16 +--sdp_on_bf16 \ +--trust_remote_code ``` > The prompt length is limited to 16 tokens. Prompts longer than this will be truncated. @@ -597,7 +615,7 @@ Some models can fit on HPU DRAM but can't fit on the CPU RAM. When we run a model on single card and don't use deepspeed, the `--disk_offload` flag allows to offload weights to disk during model quantization in INC. When this flag is mentioned, during the quantization process, each weight first is loaded from disk to CPU RAM, when brought to HPU DRAM and quantized there. This way not all the model is on the CPU RAM but only one weight each time. To enable this weights offload mechanism, add `--disk_offload` flag to the topology command line. Here is an example of using disk_offload in quantize command. -Please follow the "Running FP8 models on single device" section first before running the cmd below. +Please follow the [Running FP8 models on single device](#running-fp8-models-on-single-device) section first before running the cmd below. ```bash QUANT_CONFIG=./quantization_config/maxabs_quant.json TQDM_DISABLE=1 \ @@ -619,6 +637,57 @@ python run_generation.py \ --flash_attention_recompute ``` +### Saving FP8 Checkpoints in Hugging Face format +After quantizing the model, we can save it to a local path. + +> [!NOTE] +> Before executing the command below, please refer to the [Running with FP8](#running-with-fp8) section to measure the model quantization statistics. + +Here is an example of how to quantize and save the LLama3.1-70B model on two cards: +```bash +QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \ +--use_deepspeed --world_size 2 run_generation.py \ +--model_name_or_path meta-llama/Llama-3.1-70B \ +--attn_softmax_bf16 \ +--use_hpu_graphs \ +--trim_logits \ +--use_kv_cache \ +--reuse_cache \ +--use_flash_attention \ +--flash_attention_recompute \ +--bf16 \ +--batch_size 1 \ +--max_new_tokens 128 \ +--max_input_tokens 128 \ +--limit_hpu_graphs \ +--save_quantized_model_with_inc \ +--saved_model_path +``` + +> [!NOTE] +> For multi-card usage, the number of cards loaded and used needs to be kept consistent with that when saving. + +### Loading FP8 Checkpoints from Hugging Face +You can load pre-quantized FP8 models using the `--load_quantized_model_with_inc` argument. The `model_name_or_path` should be a model name from [Neural Magic](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) or a path to FP8 Checkpoints saved in Hugging Face format. + +Below is an example of how to load `neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8` on two cards. 
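A checkpoint saved locally with `--save_quantized_model_with_inc` can be reloaded the same way, by pointing `--model_name_or_path` at the saved directory. The sketch below is only illustrative: it assumes the default `--saved_model_path` value (`inc_quantized_model`) and the same two-card setup used when saving; the `neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8` command announced above follows right after this sketch.

```bash
# Illustrative sketch: reloads the checkpoint written by the saving example above.
# Assumes the default saved path (inc_quantized_model) and the same world size (2)
# that was used when the checkpoint was saved.
python ../gaudi_spawn.py \
--use_deepspeed --world_size 2 run_generation.py \
--model_name_or_path inc_quantized_model \
--load_quantized_model_with_inc \
--use_hpu_graphs \
--use_kv_cache \
--trim_logits \
--batch_size 1 \
--max_new_tokens 128 \
--bf16
```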
+```bash +python ../gaudi_spawn.py \ +--use_deepspeed --world_size 2 run_lm_eval.py \ +-o acc_load_fp8_model.txt \ +--model_name_or_path neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8 \ +--use_hpu_graphs \ +--use_kv_cache \ +--trim_logits \ +--batch_size 1 \ +--bf16 \ +--use_flash_attention \ +--flash_attention_recompute \ +--attn_softmax_bf16 \ +--bucket_size=128 \ +--bucket_internal \ +--load_quantized_model_with_inc +``` ### Loading 4 Bit Checkpoints from Hugging Face @@ -727,6 +796,36 @@ python run_generation.py \ --load_quantized_model_with_autogptq ``` +### Running with UINT4 weight quantization using AutoAWQ + +Llama2-7b supports UINT4 weight-only quantization through [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), which offers quantization capabilities in PyTorch. +Currently, this support is limited to UINT4 inference of pre-quantized models only. + +Please run the following command to install AutoAWQ: +```bash +pip install -r requirements_awq.txt +``` + +You can run a *UINT4 weight quantized* model using AutoAWQ by including the argument `--load_quantized_model_with_autoawq`. + +Here is an example of how to run a quantized model : +```bash +python run_generation.py \ +--attn_softmax_bf16 \ +--model_name_or_path \ +--use_hpu_graphs \ +--limit_hpu_graphs \ +--use_kv_cache \ +--bucket_size 128 \ +--bucket_internal \ +--trim_logits \ +--max_new_tokens 128 \ +--batch_size 1 \ +--bf16 \ +--load_quantized_model_with_autoawq +``` + + ## Language Model Evaluation Harness The evaluation of LLMs can be done using the `lm_eval.py` script. It utilizes the [LM evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness) @@ -746,8 +845,15 @@ pip install -r requirements_lm_eval.txt ``` > [!NOTE] +> Please add the flags for following models to improve accuracy when using lm_eval on gaudi2. Please note this is a workaround for 1.20 release only. 
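As an illustration of how these workaround variables are applied, they are simply prepended to the usual `run_lm_eval.py` invocation; the exact variable combinations per model are listed in the note lines that follow. The command below is a sketch only: the 8-card DeepSpeed launch and the output file name are assumptions, not validated settings.

```bash
# Sketch: the variables listed below for llama-2-70b-hf [bf16] are prepended to the
# usual command. The 8-card world size and the output file name are assumptions.
COMPLEXGUID_DISABLE_RMS_NORM=true ENABLE_EXPERIMENTAL_FLAGS=true \
python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_lm_eval.py \
-o acc_llama2_70b_bf16.txt \
--model_name_or_path meta-llama/Llama-2-70b-hf \
--use_hpu_graphs \
--use_kv_cache \
--bf16 \
--batch_size 1
```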
+> +> ENABLE_LB_BUNDLE_ALL_COMPUTE_MME=0 COMPLEXGUID_DISABLE_RMS_NORM=true ENABLE_EXPERIMENTAL_FLAGS=true for llama-2-70b-hf[PTQ fp8] +> +> COMPLEXGUID_DISABLE_RMS_NORM=true ENABLE_EXPERIMENTAL_FLAGS=true for Llama-3.1-70B-Instruct[PTQ fp8] and llama-2-70b-hf[bf16] +> > If custom models on hub is being used, please set env variable HF_DATASETS_TRUST_REMOTE_CODE=true instead of arg --trust_remote_code with the installed lm_eval version and dependency datasets==2.21.0 + ### Examples Evaluate Llama 7B on Gaudi on task PiQA, using the BF16 data type: diff --git a/examples/text-generation/quantization_config/maxabs_quant_mixtral.json b/examples/text-generation/quantization_config/maxabs_quant_mixtral.json index 87dc52d08a..caaff8d09e 100644 --- a/examples/text-generation/quantization_config/maxabs_quant_mixtral.json +++ b/examples/text-generation/quantization_config/maxabs_quant_mixtral.json @@ -3,10 +3,7 @@ "mode": "QUANTIZE", "observer": "maxabs", "scale_method": "maxabs_hw", - "allowlist": {"types": [], "names": ["gate","w1","w3","w2"]}, - "blocklist": {"types": [], "names": [ - "model.layers.1.block_sparse_moe.experts.(3|4).w2", - "model.layers.[29-31].block_sparse_moe.experts.[0-7].w2" - ]}, + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": ["self_attn"]}, "dump_stats_path": "./hqt_output/measure" } \ No newline at end of file diff --git a/examples/text-generation/quantization_config/pow2_quant.json b/examples/text-generation/quantization_config/pow2_quant.json new file mode 100644 index 0000000000..e1f2eb1c6e --- /dev/null +++ b/examples/text-generation/quantization_config/pow2_quant.json @@ -0,0 +1,7 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_pow2", + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/text-generation/quantization_config/weight_opt_quant.json b/examples/text-generation/quantization_config/weight_opt_quant.json new file mode 100644 index 0000000000..1ec2dc6b6a --- /dev/null +++ b/examples/text-generation/quantization_config/weight_opt_quant.json @@ -0,0 +1,7 @@ +{ + "method": "HOOKS", + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw_opt_weight", + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/text-generation/requirements_awq.txt b/examples/text-generation/requirements_awq.txt new file mode 100644 index 0000000000..5632195c99 --- /dev/null +++ b/examples/text-generation/requirements_awq.txt @@ -0,0 +1,2 @@ +triton==3.1.0 +autoawq diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index ef2252a989..ef5b198b9f 100755 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -29,7 +29,7 @@ import torch from transformers import BatchEncoding -from utils import adjust_batch, count_hpu_graphs, finalize_quantization, initialize_model +from utils import adjust_batch, count_hpu_graphs, finalize_quantization, initialize_model, save_model from optimum.habana.utils import get_hpu_memory_stats @@ -226,6 +226,11 @@ def setup_parser(parser): action="store_true", help="Skip HPU Graph usage for first token to save memory", ) + parser.add_argument( + "--clear_hpu_graphs_cache", + action="store_true", + help="Clear HPU graphs cache", + ) parser.add_argument( "--show_graphs_count", action="store_true", @@ -323,6 +328,17 @@ def setup_parser(parser): parser.add_argument( "--sdp_on_bf16", action="store_true", help="Allow pyTorch to use reduced precision in 
the SDPA math backend" ) + parser.add_argument( + "--save_quantized_model_with_inc", + action="store_true", + help="Save quantized Huggingface checkpoint using INC.", + ) + parser.add_argument( + "--saved_model_path", + type=str, + default="inc_quantized_model", + help="A path to save quantized checkpoint.", + ) quant_parser_group = parser.add_mutually_exclusive_group() quant_parser_group.add_argument( @@ -330,6 +346,11 @@ def setup_parser(parser): action="store_true", help="Load an AutoGPTQ quantized checkpoint using AutoGPTQ.", ) + quant_parser_group.add_argument( + "--load_quantized_model_with_autoawq", + action="store_true", + help="Load an AutoAWQ quantized checkpoint using AutoAWQ.", + ) quant_parser_group.add_argument( "--disk_offload", action="store_true", @@ -338,7 +359,7 @@ def setup_parser(parser): quant_parser_group.add_argument( "--load_quantized_model_with_inc", action="store_true", - help="Load a Huggingface quantized checkpoint using INC.", + help="Load a quantized Huggingface checkpoint using INC.", ) quant_parser_group.add_argument( "--local_quantized_inc_model_path", @@ -346,6 +367,12 @@ def setup_parser(parser): default=None, help="Path to neural-compressor quantized model, if set, the checkpoint will be loaded.", ) + parser.add_argument( + "--attn_batch_split", + default=1, + type=int, + help="Specify the batch size split for attention and mlp layers. 1 for no split. This is enabled only for prompt.", + ) args = parser.parse_args() @@ -361,6 +388,8 @@ def setup_parser(parser): args.quant_config = os.getenv("QUANT_CONFIG", "") if args.quant_config and args.load_quantized_model_with_autogptq: raise RuntimeError("Setting both quant_config and load_quantized_model_with_autogptq is unsupported. ") + if args.quant_config and args.load_quantized_model_with_autoawq: + raise RuntimeError("Setting both quant_config and load_quantized_model_with_autoawq is unsupported. 
") if args.quant_config == "" and args.disk_offload: logger.warning( @@ -526,7 +555,7 @@ def compute_valid_sequence_lengths_tensor(input_tokens): profiling_record_shapes=args.profiling_record_shapes, ).cpu() first_token_time = iteration_times[0] + encode_duration - logger.info(f"Time to first token = {first_token_time*1000}ms") + logger.info(f"Time to first token = {first_token_time * 1000}ms") return tokenizer.batch_decode(outputs, skip_special_tokens=True) from optimum.habana.utils import HabanaProfile @@ -541,10 +570,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens): if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1: for i in range(args.warmup): if dyn_prompt_lens is None: - print(f"Warming up iteration {i+1}/{args.warmup}", flush=True) + print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True) generate(None, args.reduce_recompile) else: - print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i+1}/{args.warmup}", flush=True) + print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True) generate(dyn_prompt_lens[0], args.reduce_recompile) else: if args.bucket_size > 0: @@ -559,7 +588,7 @@ def rounder(x): for i in range(args.warmup): lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size)) for sz in lst: - print(f"Warming up for shape {sz - 1} iteration {i+1}/{args.warmup}", flush=True) + print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True) generate(sz - 1, args.reduce_recompile) torch_hpu.synchronize() compilation_duration = time.perf_counter() - t0 @@ -586,12 +615,12 @@ def rounder(x): all_inputs = [] all_outputs = [] for i, input_sentence in enumerate(zip(input_sentences)): - print(f"input {i+1}: {input_sentence}") + print(f"input {i + 1}: {input_sentence}") all_inputs.append(input_sentence) for j, output in enumerate( zip(generated[args.num_return_sequences * i : args.num_return_sequences * (i + 1)]) ): - print(f"output {i+1}.{j+1}: {output}") + print(f"output {i + 1}.{j + 1}: {output}") all_outputs.append(output) print() @@ -630,7 +659,7 @@ def rounder(x): assert not args.simulate_dyn_prompt, "Both dataset_name and simulate_dyn_prompt are set" - raw_dataset = load_dataset(args.dataset_name) + raw_dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code) if "test" in raw_dataset: split = "test" elif "validation" in raw_dataset: @@ -719,22 +748,21 @@ def generate_dataset(batch): return prompt, outputs # warmup - if prompt_length > 0: - from optimum.habana.utils import HabanaProfile + from optimum.habana.utils import HabanaProfile - # compilation stage disable profiling - HabanaProfile.disable() - # Compilation - logger.info("Graph compilation...") - t0 = time.perf_counter() - for i, batch in enumerate(dataloader): - generate_dataset(batch) - # The first three iterations take longer because of graph compilation - if (i + 1) == 3: - break - torch_hpu.synchronize() - compilation_duration = time.perf_counter() - t0 - HabanaProfile.enable() + # compilation stage disable profiling + HabanaProfile.disable() + # Compilation + logger.info("Graph compilation...") + t0 = time.perf_counter() + for i, batch in enumerate(dataloader): + generate_dataset(batch) + # The first three iterations take longer because of graph compilation + if (i + 1) == 3: + break + torch_hpu.synchronize() + compilation_duration = time.perf_counter() - t0 + HabanaProfile.enable() total_new_tokens_generated = 0 duration = 0 @@ -747,10 +775,10 @@ def generate_dataset(batch): 
duration += time.perf_counter() - t0 total_new_tokens_generated += args.batch_size * args.max_new_tokens print(separator) - print(f"Batch n°{i+1}") - print(f"Input: {prompt[:args.batch_size]}") + print(f"Batch n°{i + 1}") + print(f"Input: {prompt[: args.batch_size]}") print( - f"Output: {tokenizer.batch_decode(outputs, skip_special_tokens=True)[:args.batch_size*args.num_return_sequences]}" + f"Output: {tokenizer.batch_decode(outputs, skip_special_tokens=True)[: args.batch_size * args.num_return_sequences]}" ) print(separator) if args.run_partial_dataset and args.n_iterations == i + 1: @@ -770,11 +798,12 @@ def generate_dataset(batch): mem = get_hpu_memory_stats() for k, v in mem.items(): print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - if prompt_length > 0: - print(f"Graph compilation duration = {compilation_duration} seconds") + print(f"Graph compilation duration = {compilation_duration} seconds") print(separator) if args.quant_config: finalize_quantization(model) + if args.save_quantized_model_with_inc: + save_model(model, tokenizer, args.saved_model_path) if args.const_serialization_path and os.path.isdir(args.const_serialization_path): import shutil diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py index 152d83ec93..88fcca6345 100644 --- a/examples/text-generation/run_lm_eval.py +++ b/examples/text-generation/run_lm_eval.py @@ -14,7 +14,7 @@ # limitations under the License. ############################################################################### -# Copyright (C) 2020-2021 Habana Labs, Ltd. an Intel Company +# Copyright (C) 2020-2025 Habana Labs, Ltd. an Intel Company ############################################################################### import argparse @@ -32,7 +32,7 @@ # Local imports from run_generation import setup_parser -from utils import finalize_quantization, initialize_model +from utils import finalize_quantization, initialize_model, save_model from optimum.habana.utils import get_hpu_memory_stats @@ -86,6 +86,13 @@ def setup_lm_eval_parser(): default=["hellaswag", "lambada_openai", "piqa", "winogrande"], ) parser.add_argument("--limit_iters", type=int, help="limit examples to run that many iterations", default=None) + parser.add_argument( + "--show_config", + action="store_true", + default=False, + help="If True, shows the full config of all tasks at the end of the evaluation.", + ) + parser.add_argument("--max_graphs", type=int, help="Maximum number of HPU graphs", default=None) args = setup_parser(parser) return args @@ -118,8 +125,18 @@ def __init__(self, tokenizer, model, args, options): "reuse_cache": self.options.reuse_cache, } ) - if self.model.config.model_type in ["llama", "mistral", "qwen2", "falcon", "starcoder2", "gemma", "baichuan"]: - if self.model.config.model_type != "falcon": + + if self.model.config.model_type in [ + "llama", + "mistral", + "qwen2", + "falcon", + "starcoder2", + "gemma", + "baichuan", + "gpt_bigcode", + ]: + if self.model.config.model_type not in ["falcon", "gpt_bigcode"]: self.model_inputs.update( { "attn_softmax_bf16": self.options.attn_softmax_bf16, @@ -132,6 +149,8 @@ def __init__(self, tokenizer, model, args, options): "flash_attention_causal_mask": self.options.flash_attention_causal_mask, } ) + if self.model.config.model_type in ["llama", "qwen2", "baichuan", "gpt_bigcode"]: + self.model_inputs.update({"flash_attention_fast_softmax": self.options.flash_attention_fast_softmax}) if args.warmup: self.warm_up() @@ -191,6 +210,36 @@ def
_model_call(self, inps): logits = logits.to(torch.float32) return logits + def get_model_info(self) -> dict: + """ + Patched method to get Hugging Face model information for experiment reproducibility. + source: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.7/lm_eval/models/huggingface.py/#L1375 + Remove from SynapseAI 1.21 + """ + + def get_model_num_params(model) -> int: + if hasattr(model, "num_parameters"): + return model.num_parameters() + elif hasattr(model, "parameters"): + return sum(p.numel() for p in model.parameters()) + else: + return -1 + + def get_model_dtype(model) -> str: + if hasattr(model, "dtype"): + return model.dtype + elif hasattr(model, "parameters"): + return next(model.parameters()).dtype + else: + return "" + + model_info = { + "model_num_parameters": get_model_num_params(self._model), + "model_dtype": get_model_dtype(self._model), + "model_revision": self.revision, + } + return model_info + def main(): args = setup_lm_eval_parser() @@ -226,9 +275,12 @@ def main(): for k, v in mem.items(): print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) json.dump(results, open(args.output_file, "w"), indent=2) - print(json.dumps(results, indent=2)) + if args.show_config: + print(json.dumps(results, indent=2)) if args.quant_config: finalize_quantization(model) + if args.save_quantized_model_with_inc: + save_model(model, tokenizer, args.saved_model_path) if args.const_serialization_path and os.path.isdir(args.const_serialization_path): import shutil diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md index 2aa036ec3a..ec28462501 100644 --- a/examples/text-generation/text-generation-pipeline/README.md +++ b/examples/text-generation/text-generation-pipeline/README.md @@ -22,7 +22,7 @@ The text-generation pipeline can be used to perform text-generation by providing If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows: ```bash -pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.19.0 +pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.20.0 ``` If you would like to use the pipeline with LangChain classes, you can install LangChain as follows: diff --git a/examples/text-generation/text-generation-pipeline/run_pipeline.py b/examples/text-generation/text-generation-pipeline/run_pipeline.py index 43aea65cec..11e542d7a5 100644 --- a/examples/text-generation/text-generation-pipeline/run_pipeline.py +++ b/examples/text-generation/text-generation-pipeline/run_pipeline.py @@ -45,14 +45,14 @@ def main(): duration = 0 for iteration in range(args.n_iterations): - logger.info(f"Running inference iteration {iteration+1}...") + logger.info(f"Running inference iteration {iteration + 1}...") t0 = time.perf_counter() output = pipe(input_sentences) duration += time.perf_counter() - t0 for i, (input_sentence, generated_text) in enumerate(zip(input_sentences, output)): - print(f"Prompt[{iteration+1}][{i+1}]: {input_sentence}") - print(f"Generated Text[{iteration+1}][{i+1}]: {repr(generated_text)}\n") + print(f"Prompt[{iteration + 1}][{i + 1}]: {input_sentence}") + print(f"Generated Text[{iteration + 1}][{i + 1}]: {repr(generated_text)}\n") throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds") diff --git 
a/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py b/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py index 556494cd37..6212e808aa 100644 --- a/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py +++ b/examples/text-generation/text-generation-pipeline/run_pipeline_langchain.py @@ -87,8 +87,8 @@ def main(): duration += time.perf_counter() - t0 for i, (question, answer) in enumerate(zip(input_questions, responses)): - print(f"Question[{iteration+1}][{i+1}]: {question['question']}") - print(f"Response[{iteration+1}][{i+1}]: {answer}\n") + print(f"Question[{iteration + 1}][{i + 1}]: {question['question']}") + print(f"Response[{iteration + 1}][{i + 1}]: {answer}\n") throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds") diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 6a576799e2..f1829bf3b5 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -130,8 +130,8 @@ def setup_const_serialization(const_serialization_path): def setup_env(args): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. - check_min_version("4.34.0") - check_optimum_habana_min_version("1.9.0.dev0") + check_min_version("4.45.0") + check_optimum_habana_min_version("1.17.0.dev0") # TODO: SW-167588 - WA for memory issue in hqt prep_model os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") @@ -158,7 +158,7 @@ def setup_device(args): if args.device == "hpu": import habana_frameworks.torch.core as htcore - if args.quant_config: + if args.quant_config or args.load_quantized_model_with_inc or args.local_quantized_inc_model_path: htcore.hpu_set_env() return torch.device(args.device) @@ -252,7 +252,20 @@ def setup_model(args, model_dtype, model_kwargs, logger): model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs ) + elif args.load_quantized_model_with_autoawq: + from transformers import AwqConfig + + quantization_config = AwqConfig(bits=4, version="hpu") + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, quantization_config=quantization_config, **model_kwargs + ) elif args.load_quantized_model_with_inc: + # TODO: This will be removed in v1.20 Synapse release + # Override neural_compressor split_rank_state_dict for loading neural_magic models on multi-cards. 
+ import neural_compressor.torch.algorithms.fp8_quant.save_load as nc_sl + + nc_sl.split_rank_state_dict = local_split_rank_state_dict + from neural_compressor.torch.quantization import load model = load(model_name_or_path=args.model_name_or_path, format="huggingface", device="hpu", **model_kwargs) @@ -297,7 +310,8 @@ def setup_model(args, model_dtype, model_kwargs, logger): if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": model = wrap_in_hpu_graph(model, hash_with_views=False) else: - model = wrap_in_hpu_graph(model) + max_graphs = getattr(args, "max_graphs", None) + model = wrap_in_hpu_graph(model, max_graphs=max_graphs) if args.assistant_model is not None: assistant_model = wrap_in_hpu_graph(assistant_model) if _is_peft_model(model): @@ -307,6 +321,9 @@ def setup_model(args, model_dtype, model_kwargs, logger): if args.torch_compile: model = get_torch_compiled_model(model, logger) + assert "PT_HPU_LAZY_MODE" in os.environ and os.environ["PT_HPU_LAZY_MODE"] == "0", ( + "Please set PT_HPU_LAZY_MODE=0 on command line when using `--torch_compile`" + ) # if args.assistant_model is not None: # assistant_model = get_torch_compiled_model(assistant_model, logger) return model, assistant_model @@ -610,6 +627,12 @@ def setup_tokenizer(args, model, assistant_model, logger): ) model.generation_config.eos_token_id = model.generation_config.eos_token_id[-1] + if model.config.model_type == "mpt": + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if model.generation_config.pad_token_id is None: + model.generation_config.pad_token_id = tokenizer.eos_token_id + # Some models like GPT2 do not have a PAD token so we have to set it if necessary if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token @@ -647,6 +670,7 @@ def setup_generation_config(args, model, assistant_model, tokenizer): generation_config.trim_logits = args.trim_logits generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.clear_hpu_graphs_cache = args.clear_hpu_graphs_cache generation_config.reuse_cache = args.reuse_cache generation_config.reduce_recompile = args.reduce_recompile if generation_config.reduce_recompile: @@ -657,6 +681,7 @@ def setup_generation_config(args, model, assistant_model, tokenizer): generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax generation_config.trust_remote_code = args.trust_remote_code generation_config.valid_sequence_lengths = None + generation_config.attn_batch_split = args.attn_batch_split return generation_config @@ -667,7 +692,7 @@ def exclude_hpu_graph_configs(args): if "falcon-180B" in args.model_name_or_path or "falcon-180b" in args.model_name_or_path: return False if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: - if args.quant_config: + if args.quant_config or args.load_quantized_model_with_inc or args.local_quantized_inc_model_path: if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: return False else: @@ -681,6 +706,9 @@ def exclude_hpu_graph_configs(args): def initialize_model(args, logger): init_start = time.perf_counter() setup_distributed(args) + if not args.world_size > 0 and args.attn_batch_split > 1: + logger.warning("Disabling attention batch splitting as it's unnecessary for single-card execution") + args.attn_batch_split = 1 if exclude_hpu_graph_configs(args): args.limit_hpu_graphs = False override_prints(args.global_rank == 0 or args.verbose_workers, 
logger) @@ -710,7 +738,7 @@ def initialize_model(args, logger): model, assistant_model = ( setup_model(args, model_dtype, model_kwargs, logger) - if not use_deepspeed + if not use_deepspeed or args.load_quantized_model_with_inc else setup_distributed_model(args, model_dtype, model_kwargs, logger) if args.parallel_strategy == "none" else setup_distributed_model_tp(args, model_dtype, model_kwargs, logger, cache_dir) @@ -723,10 +751,48 @@ def initialize_model(args, logger): if args.const_serialization_path: setup_const_serialization(args.const_serialization_path) - if args.quant_config: + if args.quant_config or args.load_quantized_model_with_inc or args.local_quantized_inc_model_path: model = setup_inference(args, model) init_end = time.perf_counter() logger.info(f"Args: {args}") logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}") logger.info(f"Model initialization took {(init_end - init_start):.3f}s") return model, assistant_model, tokenizer, generation_config + + +def save_model(model, tokenizer, save_path): + """Saves the model and tokenizer in the huggingface format with neural_compressor.""" + from neural_compressor.torch.quantization import save + + save(model, save_path, format="huggingface") + tokenizer.save_pretrained(save_path) + + +# TODO: This will be removed in v1.20 Synapse release +# Override neural_compressor split_rank_state_dict for loading neural_magic models on multi-cards. +def local_split_rank_state_dict(model, gathered_state_dict): + """split state_dict for current local_rank.""" + from neural_compressor.torch.algorithms.fp8_quant.save_load import ( + cur_accelerator, + local_rank, + split_weights, + world_size, + ) + + rank_state_dict = {} + for name, param in model.named_parameters(): + if name in gathered_state_dict: + full_weight = gathered_state_dict[name] + if len(param.shape) != 0 and full_weight.shape != param.shape: + if full_weight.shape[0] != param.shape[0]: + split_weight = split_weights(full_weight, world_size, local_rank, split_axis=0).clone() + elif full_weight.shape[1] != param.shape[1]: + split_weight = split_weights(full_weight, world_size, local_rank, split_axis=1).clone() + else: + split_weight = split_weights(full_weight, world_size, local_rank, split_axis=0).clone() + else: + split_weight = full_weight + rank_state_dict[name] = split_weight + cur_accelerator.synchronize() + + return rank_state_dict diff --git a/examples/text-to-speech/README.md b/examples/text-to-speech/README.md index a1e089f55e..21070d275f 100644 --- a/examples/text-to-speech/README.md +++ b/examples/text-to-speech/README.md @@ -33,8 +33,4 @@ python3 run_pipeline.py \ --text "Hello, my dog is cooler than you!" 
\ --use_hpu_graphs \ --bf16 -``` -Models that have been validated: - - [microsoft/speecht5_tts](https://huggingface.co/microsoft/speecht5_tts) - - [facebook/hf-seamless-m4t-medium](https://huggingface.co/facebook/hf-seamless-m4t-medium) - - [facebook/mms-tts-eng](https://huggingface.co/facebook/mms-tts-eng) +``` \ No newline at end of file diff --git a/examples/text-to-speech/requirements.txt b/examples/text-to-speech/requirements.txt index c5fb09c806..01d3da67aa 100644 --- a/examples/text-to-speech/requirements.txt +++ b/examples/text-to-speech/requirements.txt @@ -1,2 +1,3 @@ datasets soundfile +sentencepiece diff --git a/examples/text-to-speech/run_pipeline.py b/examples/text-to-speech/run_pipeline.py index 1d9b53de7d..81546b0cb9 100644 --- a/examples/text-to-speech/run_pipeline.py +++ b/examples/text-to-speech/run_pipeline.py @@ -129,7 +129,7 @@ def main(): text, batch_size=args.batch_size, forward_params=forward_params, generate_kwargs=generate_kwargs ) end = time.time() - logger.info(f"speech = {speech} time = {(end-start) * 1000 / args.n_iterations }ms") + logger.info(f"speech = {speech} time = {(end - start) * 1000 / args.n_iterations}ms") sf.write("speech.wav", speech[0]["audio"].squeeze(), samplerate=speech[0]["sampling_rate"]) diff --git a/examples/text-to-video/requirements.txt b/examples/text-to-video/requirements.txt deleted file mode 100644 index 6ab6d0d570..0000000000 --- a/examples/text-to-video/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -opencv-python-headless diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index 6f55ae1350..8a3ca0b0c4 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.45.0") -check_optimum_habana_min_version("1.16.0.dev0") +check_optimum_habana_min_version("1.17.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/trl/README.md b/examples/trl/README.md index 05464309d5..5e488e7072 100644 --- a/examples/trl/README.md +++ b/examples/trl/README.md @@ -79,103 +79,10 @@ $ pip install -U -r requirements.txt ### Training -#### For meta-llama/Llama-2-7b-hf - -The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model. -There are two main steps to the DPO training process: -1. 
Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se: - - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi sft.py \ - --model_name_or_path meta-llama/Llama-2-7b-hf \ - --dataset_name "lvwerra/stack-exchange-paired" \ - --output_dir="./sft" \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --do_train \ - --per_device_train_batch_size=4 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_llama2" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - ``` - python merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` - -2. Run the DPO trainer using the model saved by the previous step: - ``` - python ../gaudi_spawn.py --world_size 8 --use_mpi dpo.py \ - --model_name_or_path="sft/final_merged_checkpoint" \ - --tokenizer_name_or_path=meta-llama/Llama-2-7b-hf \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "out_proj" "fc_in" "fc_out" "wte" \ - --output_dir="dpo" \ - --report_to=none - ``` - -#### mistralai/Mistral-7B-v0.1 - -1. Supervised fine-tuning of the base Mistral-7B-v0.1 model: - - ``` - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed sft.py \ - --model_name_or_path mistralai/Mistral-7B-v0.1 \ - --dataset_name "lvwerra/stack-exchange-paired" \ - --deepspeed ../language-modeling/llama2_ds_zero3_config.json \ - --output_dir="./sft" \ - --do_train \ - --max_steps=500 \ - --logging_steps=10 \ - --save_steps=100 \ - --per_device_train_batch_size=1 \ - --per_device_eval_batch_size=1 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-4 \ - --lr_scheduler_type="cosine" \ - --warmup_steps=100 \ - --weight_decay=0.05 \ - --optim="paged_adamw_32bit" \ - --lora_target_modules "q_proj" "v_proj" \ - --bf16 \ - --remove_unused_columns=False \ - --run_name="sft_mistral" \ - --report_to=none \ - --use_habana \ - --use_lazy_mode - ``` - To merge the adaptors to get the final sft merged checkpoint, we can use the `merge_peft_adapter.py` helper script that comes with TRL: - - ``` - python merge_peft_adapter.py --base_model_name="mistralai/Mistral-7B-v0.1" --adapter_model_name="sft" --output_name="sft/final_merged_checkpoint" - ``` - -2. Run the DPO trainer using the model saved by the previous step: - ``` - DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 python ../gaudi_spawn.py --world_size 8 --use_deepspeed dpo.py \ - --model_name_or_path="sft/final_merged_checkpoint" \ - --tokenizer_name_or_path=mistralai/Mistral-7B-v0.1 \ - --deepspeed ../language-modeling/llama2_ds_zero3_config.json \ - --lora_target_modules "q_proj" "v_proj" "k_proj" "out_proj" "fc_in" "fc_out" "wte" \ - --output_dir="dpo" \ - --max_prompt_length=256 \ - --max_length=512 \ - --report_to=none - ``` - #### For meta-llama/Llama-2-70b-hf +The following example is for the creation of StackLlaMa 2: a Stack exchange llama-v2-70b model. There are two main steps to the DPO training process. + For large model like Llama2-70B, we could use DeepSpeed Zero-3 to enable DPO training in multi-card. steps like: 1. 
Supervised fine-tuning of the base llama-v2-70b model to create llama-v2-70b-se: @@ -360,7 +267,8 @@ python ddpo.py \ --use_hpu_graphs \ --bf16 \ --hf_hub_model_id="ddpo-finetuned-stable-diffusion" \ - --push_to_hub False + --push_to_hub False \ + --sdp_on_bf16 ``` > [!NOTE] > Due to a known issue on Gaudi3, sample_batch_sizes should be changed to 3. The issue will be fixed in the future release. diff --git a/examples/trl/ddpo.py b/examples/trl/ddpo.py index 46caf64c49..a2f1f15733 100644 --- a/examples/trl/ddpo.py +++ b/examples/trl/ddpo.py @@ -79,6 +79,9 @@ class ScriptArguments: push_to_hub: bool = field(default=False, metadata={"help": "Whether or not to push the model to the Hub."}) use_habana: bool = field(default=True, metadata={"help": "Whether or not to use HPU."}) use_hpu_graphs: bool = field(default=True, metadata={"help": "Whether or not to use hpu graphs."}) + sdp_on_bf16: bool = field( + default=False, metadata={"help": "Allow pyTorch to use reduced precision in the SDPA math backend."} + ) class MLP(nn.Module): @@ -225,6 +228,7 @@ def image_outputs_logger(image_data, global_step, accelerate_logger): use_habana=args.use_habana, use_hpu_graphs=args.use_hpu_graphs, gaudi_config=gaudi_config, + sdp_on_bf16=args.sdp_on_bf16, ) trainer = GaudiDDPOTrainer( diff --git a/examples/text-to-video/README.md b/examples/video-comprehension/README.md similarity index 55% rename from examples/text-to-video/README.md rename to examples/video-comprehension/README.md index 1df4e44e59..da54f26740 100644 --- a/examples/text-to-video/README.md +++ b/examples/video-comprehension/README.md @@ -1,12 +1,9 @@ -# Text to Video Examples - -This directory contains a script that showcases how to use the `GaudiTextToVideoSDPipeline` to run text-to-video generation tasks on HPUs. - -## Requirements +# Examples -First, you should install the requirements: - -```bash -pip install -r requirements.txt -``` +This directory contains example scripts that demonstrate how to perform video comprehension on Gaudi with graph mode. ## Single-HPU inference +### Video-LLaVA Model + ```bash -python3 text_to_video_generation.py \ - --model_name_or_path ali-vilab/text-to-video-ms-1.7b \ - --prompts "An astronaut riding a horse" \ - --use_habana \ +python3 run_example.py \ + --model_name_or_path "LanguageBind/Video-LLaVA-7B-hf" \ + --warmup 3 \ + --n_iterations 5 \ + --batch_size 1 \ --use_hpu_graphs \ - --dtype bf16 + --bf16 \ + --output_dir ./ ``` - Models that have been validated: - - [ali-vilab/text-to-video-ms-1.7b](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) + - [LanguageBind/Video-LLaVA-7B-hf ](https://huggingface.co/LanguageBind/Video-LLaVA-7B-hf) diff --git a/examples/video-comprehension/requirements.txt b/examples/video-comprehension/requirements.txt new file mode 100644 index 0000000000..7ed65352d9 --- /dev/null +++ b/examples/video-comprehension/requirements.txt @@ -0,0 +1,2 @@ +av == 12.1.0 +sentencepiece == 0.2.0 diff --git a/examples/video-comprehension/run_example.py b/examples/video-comprehension/run_example.py new file mode 100644 index 0000000000..5868bea3e8 --- /dev/null +++ b/examples/video-comprehension/run_example.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import json +import logging +import os +import time +from pathlib import Path + +import av +import numpy as np +import torch +from huggingface_hub import hf_hub_download +from transformers import VideoLlavaProcessor + +from optimum.habana.transformers.modeling_utils import ( + GaudiVideoLlavaForConditionalGeneration, + adapt_transformers_to_gaudi, +) + + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def read_video_pyav(container, indices): + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + help="Path to pre-trained model", + ) + parser.add_argument( + "--video_path", + default=None, + type=str, + nargs="*", + help='Path to video as input. Can be a single string (eg: --image_path "URL1"), or a list of space-separated strings (eg: --video_path "URL1" "URL2")', + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + help='Optional argument to give a prompt of your choice as input. is a single string (eg: --prompt "Hello world")', + ) + parser.add_argument( + "--use_hpu_graphs", + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument("--max_new_tokens", type=int, default=100, help="Number of tokens to generate.") + parser.add_argument( + "--bf16", + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations for benchmarking.") + parser.add_argument("--n_iterations", type=int, default=5, help="Number of inference iterations for benchmarking.") + parser.add_argument( + "--ignore_eos", + action="store_true", + help="Whether to disable stopping with eos token when calling `generate`.", + ) + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. 
This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + + args = parser.parse_args() + + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + + if args.video_path is None: + args.video_path = [ + hf_hub_download( + repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset" + ) + ] + + if args.prompt is None: + args.prompt = ["USER: