From c3dc09354df362c95d415f93a19226a4cad7e24d Mon Sep 17 00:00:00 2001
From: Qing Lan
Date: Tue, 23 Apr 2024 14:48:29 -0700
Subject: [PATCH] [CI][DeepSpeed Deprecation] Batch 1: deepspeed and Unmerged
 LoRA test removal (#1796)

---
 .github/workflows/llm_integration.yml           | 469 +-----------------
 .../workflows/rolling_batch_integration.yml     |  75 +--
 tests/integration/llm/client.py                 | 236 +--------
 tests/integration/llm/deepspeed-model.py        | 134 -----
 tests/integration/llm/prepare.py                | 388 ---------------
 tests/integration/llm/unmerged_lora.py          | 129 -----
 6 files changed, 4 insertions(+), 1427 deletions(-)
 delete mode 100644 tests/integration/llm/deepspeed-model.py
 delete mode 100644 tests/integration/llm/unmerged_lora.py

diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index f9782f8fc..8f66b7240 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -8,7 +8,7 @@ on:
         required: false
         default: ''
       run_test:
-        description: 'Run only the tests you need [ds, hf, aot, trtllm, lora-correctness, smoothquant]'
+        description: 'Run only the tests you need [ hf, trtllm ]'
         required: false
         default: ''
   schedule:
@@ -37,74 +37,9 @@ jobs:
           --fail \
           | jq '.token' | tr -d '"' )
           ./start_instance.sh action_g5 $token djl-serving
-      - name: Create new G5 instance
-        id: create_gpu3
-        run: |
-          cd /home/ubuntu/djl_benchmark_script/scripts
-          token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
-          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
-          --fail \
-          | jq '.token' | tr -d '"' )
-          ./start_instance.sh action_g5 $token djl-serving
     outputs:
       gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g5_instance_id }}
       gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g5_instance_id }}
-      gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g5_instance_id }}
-
-  ds-raw-test:
-    if: contains(fromJson('["", "ds"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests numpy
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test bloom-7b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_raw bloom-7b1
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py deepspeed_raw bloom-7b1
-          docker rm -f $(docker ps -aq)
-      - name: Test GPTJ-6B
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_raw gpt-j-6b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py deepspeed_raw gpt-j-6b
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          docker rm -f $(docker ps -aq) || true
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: ds-raw-logs
-          path: tests/integration/logs/
 
   hf-handler-test:
     if: contains(fromJson('["", "hf"]'), github.event.inputs.run_test)
@@ -222,404 +157,6 @@ jobs:
           name: hf-handler-${{ matrix.arch }}-logs
           path: tests/integration/logs/
 
-  hf-lora-correctness-test:
-    if: contains(fromJson('["", "lora-correctness"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests numpy
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test unmerged lora llama-7b correctness
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py unmerged_lora llama-7b-unmerged-lora
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py unmerged_lora llama-7b-unmerged-lora
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          docker rm -f $(docker ps -aq) || true
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: hf-lora-correctness-${{ matrix.arch }}-logs
-          path: tests/integration/logs/
-
-  ds-handler-test:
-    if: contains(fromJson('["", "ds"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests pillow numpy
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test stable-diffusion-2-1-base
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py stable-diffusion stable-diffusion-2-1-base
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py stable-diffusion stable-diffusion-2-1-base
-          docker rm -f $(docker ps -aq)
-      - name: Test stable-diffusion-v1-5
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py stable-diffusion stable-diffusion-v1-5
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py stable-diffusion stable-diffusion-v1-5
-          docker rm -f $(docker ps -aq)
-      - name: Test stable-diffusion-2-depth
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py stable-diffusion stable-diffusion-2-depth
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py stable-diffusion stable-diffusion-2-depth
-          docker rm -f $(docker ps -aq)
-      - name: Test bloom-7b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed bloom-7b1
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed bloom-7b1
-          docker rm -f $(docker ps -aq)
-      - name: Test LLAMA-7B
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed open-llama-7b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed open-llama-7b
-          docker rm -f $(docker ps -aq)
-      - name: Test GPTJ-6B
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed gpt-j-6b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed gpt-j-6b
-          docker rm -f $(docker ps -aq)
-      - name: Test OPT-13B
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed opt-13b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed opt-13b
-          docker rm -f $(docker ps -aq)
-      - name: Test gpt4all-lora
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed gpt4all-lora
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed gpt4all-lora
-          docker rm -f $(docker ps -aq)
-      - name: Test streaming gpt-neo-1.3b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          echo -en "CUDA_VISIBLE_DEVICES=1,3" > docker_env
-          python3 llm/prepare.py deepspeed gpt-neo-1.3b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed gpt-neo-1.3b
-          rm -rf docker_env
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          docker rm -f $(docker ps -aq) || true
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: ds-handler-logs
-          path: tests/integration/logs/
-
-  ds-aot-raw-test:
-    if: contains(fromJson('["", "aot"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests numpy
-      - name: Install s5cmd
-        working-directory: serving/docker
-        run: sudo scripts/install_s5cmd.sh x64
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: serving/docker
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test gpt-neo-2.7b partition
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          python3 llm/prepare.py deepspeed_aot gpt-neo-2.7b
-
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          partition --model-dir /opt/ml/input/data/training | tee partition_output.log
-
-          # checking if pt files are generated.
-          sudo mv $PWD/models/test/partition-test $PWD/models/
-          if ls $PWD/models/partition-test/*.pt &>/dev/null ; then echo "checkpoint files generated"; else exit 1; fi
-      - name: Test gpt-neo-2.7b inference
-        working-directory: tests/integration
-        run: |
-          sudo cp $PWD/models/test/model.py $PWD/models/partition-test
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve -m test=file:/opt/ml/model/partition-test/
-          curl http://127.0.0.1:8080/models
-          python3 llm/client.py deepspeed_aot gpt-neo-2.7b
-          docker rm -f $(docker ps -aq)
-      - name: Remove models dir
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          docker rm -f $(docker ps -aq) || true
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: ds-aot-${{ matrix.arch }}-logs
-          path: tests/integration/logs/
-
-  ds-handler-aot-test:
-    if: contains(fromJson('["", "aot"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests numpy
-      - name: Install s5cmd
-        working-directory: serving/docker
-        run: sudo scripts/install_s5cmd.sh x64
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: serving/docker
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test opt-6.7b partition
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          python3 llm/prepare.py deepspeed_handler_aot opt-6.7b
-          # To test the requirements.txt download.
-          echo "dummy_test" >> $PWD/models/test/requirements.txt
-
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          partition --model-dir /opt/ml/input/data/training/ | tee partition_output.log
-
-          # checking if pt files are generated.
-          sudo mv $PWD/models/test/partition-test $PWD/models/
-          if ls $PWD/models/partition-test/*.pt &>/dev/null ; then echo "checkpoint files generated"; else exit 1; fi
-
-          # checking whether requirements.txt download is successful
-          if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
-          then echo "requirements.txt install was successful"; else exit 1; fi
-      - name: Test opt-6.7b inference
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve -m test=file:/opt/ml/model/partition-test/
-          python3 llm/client.py deepspeed_handler_aot opt-6.7b
-          docker rm -f $(docker ps -aq)
-          sudo rm -rf models
-      - name: Test bloom-7b1 partition
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          python3 llm/prepare.py deepspeed_handler_aot bloom-7b1
-
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          train | tee partition_output.log
-
-          # checking if pt files are generated.
-          # downloads the uploaded partitioned checkpoints from s3url.
-          /opt/djl/bin/s5cmd --retry-count 1 sync s3://djl-llm/bloom-7b1-tp4/ds-aot-handler/* $PWD/models/partition-test
-          if ls $PWD/models/partition-test/*.pt &>/dev/null ; then echo "checkpoint files generated"; else exit 1; fi
-          if ls $PWD/models/partition-test/ds_inference_config.json &>/dev/null ; \
-          then echo "ds_inference_config.json generated"; else exit 1; fi
-      - name: Test bloom-7b1 inference
-        working-directory: tests/integration
-        run: |
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve -m test=file:/opt/ml/model/partition-test/
-          python3 llm/client.py deepspeed_handler_aot bloom-7b1
-          docker rm -f $(docker ps -aq)
-      - name: Remove models dir
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          docker rm -f $(docker ps -aq) || true
-          name: ds-aot-handler-logs
-
-  ds-smoothquant-handler-test:
-    if: contains(fromJson('["", "smoothquant"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests pillow numpy
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test gpt-j-6b default smoothquant
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_smoothquant gpt-j-6b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_smoothquant gpt-j-6b
-          docker rm -f $(docker ps -aq)
-      - name: Test gpt-neox-20b smoothquant custom alpha
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_smoothquant gpt-neox-20b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_smoothquant gpt-neox-20b
-          docker rm -f $(docker ps -aq)
-      - name: Test llama2-13b dynamic_quant only
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_smoothquant llama2-13b-dynamic-int8
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_smoothquant llama2-13b-dynamic-int8
-          docker rm -f $(docker ps -aq)
-      - name: Test llama2-13b smoothquant
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_smoothquant llama2-13b-smoothquant
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_smoothquant llama2-13b-smoothquant
-          docker rm -f $(docker ps -aq)
-      - name: Remove models dir
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          sudo rm -rf models
-          docker rm -f $(docker ps -aq) || true
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: ds-smoothquant-logs
-          path: tests/integration/logs/
-
   trt-llm-handler-test:
     if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test)
     runs-on: [ self-hosted, g5 ]
@@ -829,7 +366,7 @@ jobs:
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, ds-handler-aot-test, ds-smoothquant-handler-test, trt-llm-handler-test, trt-llm-handler-test-2]
+    needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2]
     steps:
       - name: Stop all instances
        run: |
@@ -838,5 +375,3 @@ jobs:
           ./stop_instance.sh $instance_id
           instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
           ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
-          ./stop_instance.sh $instance_id
diff --git a/.github/workflows/rolling_batch_integration.yml b/.github/workflows/rolling_batch_integration.yml
index e22bd6fe3..0df9e96cc 100644
--- a/.github/workflows/rolling_batch_integration.yml
+++ b/.github/workflows/rolling_batch_integration.yml
@@ -528,83 +528,10 @@ jobs:
           name: vllm-logs
           path: tests/integration/logs/
 
-  deepspeed-test:
-    if: contains(fromJson('["", "deepspeed"]'), github.event.inputs.run_test)
-    runs-on: [ self-hosted, g5 ]
-    timeout-minutes: 60
-    needs: create-runners
-    steps:
-      - uses: actions/checkout@v4
-      - name: Clean env
-        run: |
-          yes | docker system prune -a --volumes
-          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
-          echo "wait dpkg lock..."
-          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install requests pillow numpy
-      - name: Build container name
-        run: ./serving/docker/scripts/docker_name_builder.sh deepspeed ${{ github.event.inputs.djl-version }}
-      - name: Download models and dockers
-        working-directory: tests/integration
-        run: |
-          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
-      - name: Test deepspeed_rolling_batch gpt-neox-20b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_rolling_batch gpt-neox-20b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_rolling_batch gpt-neox-20b
-          docker rm -f $(docker ps -aq)
-      - name: Test deepspeed_rolling_batch open-llama-7b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_rolling_batch open-llama-7b
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_rolling_batch open-llama-7b
-          docker rm -f $(docker ps -aq)
-      - name: Test deepspeed_rolling_batch gpt2
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_rolling_batch gpt2
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_rolling_batch gpt2
-          docker rm -f $(docker ps -aq)
-      - name: Test deepspeed_rolling_batch llama2-13b-smoothquant
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py deepspeed_rolling_batch llama2-13b-smoothquant
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models deepspeed \
-          serve
-          python3 llm/client.py deepspeed_rolling_batch llama2-13b-smoothquant
-          docker rm -f $(docker ps -aq)
-      - name: On fail step
-        if: ${{ failure() }}
-        working-directory: tests/integration
-        run: |
-          docker rm -f $(docker ps -aq) || true
-          cat logs/serving.log
-      - name: Upload test logs
-        uses: actions/upload-artifact@v3
-        with:
-          name: ds-rolling-batch-handler-logs
-          path: tests/integration/logs/
-
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test, deepspeed-test ]
+    needs: [ create-runners, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test ]
     steps:
       - name: Stop all instances
         run: |
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 72f22f4f4..b95747704 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -59,27 +59,6 @@ def get_model_name():
     return res["models"][0]["modelName"]
 
 
-ds_raw_model_spec = {
-    "gpt-j-6b": {
-        "max_memory_per_gpu": [6.0, 6.0, 6.0, 6.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "use_pipeline": True
-    },
-    "bloom-7b1": {
-        "max_memory_per_gpu": [7.0, 7.0, 8.0, 9.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "use_pipeline": False
-    },
-    "opt-30b": {
-        "max_memory_per_gpu": [16.0, 16.0, 16.0, 16.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "use_pipeline": False
-    }
-}
-
 hf_model_spec = {
     "gpt-neo-2.7b": {
         "max_memory_per_gpu": [8.0, 8.0, 9.0, 17.0],
@@ -132,64 +111,6 @@ def get_model_name():
     }
 }
 
-ds_model_spec = {
-    "gpt-j-6b": {
-        "max_memory_per_gpu": [9.0, 10.0, 11.0, 12.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "worker": 2
-    },
-    "bloom-7b1": {
-        "max_memory_per_gpu": [7.0, 8.0, 8.0, 9.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256]
-    },
-    "open-llama-7b": {
-        "max_memory_per_gpu": [8.0, 7.0, 7.0, 7.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256]
-    },
-    "opt-13b": {
-        "max_memory_per_gpu": [17.0, 18.0, 19.0, 22.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "worker": 2
-    },
-    "gpt-neo-1.3b": {
-        "max_memory_per_gpu": [4.0, 5.0],
-        "batch_size": [1, 4],
-        "seq_length": [16],
-        "worker": 1,
-        "stream_output": True,
-    },
-    "gpt4all-lora": {
-        "max_memory_per_gpu": [10.0, 12.0],
-        "batch_size": [1, 4],
-        "seq_length": [16, 32],
-        "worker": 1,
-    }
-}
-
-sd_model_spec = {
-    "stable-diffusion-v1-5": {
-        "max_memory_per_gpu": 8.0,
-        "size": [256, 512],
-        "num_inference_steps": [50, 100]
-    },
-    "stable-diffusion-2-1-base": {
-        "max_memory_per_gpu": 8.0,
-        "size": [256, 512],
-        "num_inference_steps": [50, 100],
-        "workers": 2
-    },
-    "stable-diffusion-2-depth": {
-        "max_memory_per_gpu": 8.0,
-        "size": [512],
-        "num_inference_steps": [50],
-        "depth": True
-    }
-}
-
 neuron_sd_model_spec = {
     "stable-diffusion-1.5-neuron": {
         "num_inference_steps": [50, 100]
@@ -202,27 +123,6 @@ def get_model_name():
     }
 }
 
-ds_aot_model_spec = {
-    "opt-6.7b": {
-        "max_memory_per_gpu": [12.0, 12.0, 12.0, 12.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "use_pipeline": True
-    },
-    "bloom-7b1": {
-        "max_memory_per_gpu": [12.0, 12.0, 12.0, 12.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "use_pipeline": False
-    },
-    "gpt-neo-2.7b": {
-        "max_memory_per_gpu": [12.0, 12.0, 12.0, 17.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-        "use_pipeline": True
-    }
-}
-
 transformers_neuronx_model_spec = {
     "gpt2": {
         "worker": 1,
@@ -488,29 +388,6 @@ def get_model_name():
     }
 }
 
-ds_smoothquant_model_spec = {
-    "gpt-j-6b": {
-        "max_memory_per_gpu": [6.0, 6.0, 6.0, 6.0],
-        "batch_size": [1, 2, 4, 8],
-        "seq_length": [64, 128, 256],
-    },
-    "gpt-neox-20b": {
-        "max_memory_per_gpu": [15.0, 15.0],
-        "batch_size": [1, 8],
-        "seq_length": [64, 128, 256],
-    },
-    "llama2-13b-dynamic-int8": {
-        "max_memory_per_gpu": [9.0, 9.0, 9.0],
-        "batch_size": [1, 2, 4],
-        "seq_length": [64, 128, 256],
-    },
-    "llama2-13b-smoothquant": {
-        "max_memory_per_gpu": [9.2, 9.2, 9.2],
-        "batch_size": [2, 4, 8],
-        "seq_length": [64, 128, 256],
-    },
-}
-
 lmi_dist_aiccl_model_spec = {
     "llama-2-70b-aiccl": {
         "max_memory_per_gpu": [40.0],
@@ -628,33 +505,6 @@ def get_model_name():
     }
 }
 
-deepspeed_rolling_batch_model_spec = {
-    "gpt-neox-20b": {
-        "max_memory_per_gpu": [25.0],
-        "batch_size": [1, 4],
-        "seq_length": [256],
-        "tokenizer": "EleutherAI/gpt-neox-20b"
-    },
-    "open-llama-7b": {
-        "max_memory_per_gpu": [25.0],
-        "batch_size": [1, 4],
-        "seq_length": [256],
-        "tokenizer": "openlm-research/open_llama_7b"
-    },
-    "gpt2": {
-        "max_memory_per_gpu": [25.0],
-        "batch_size": [1, 4],
-        "seq_length": [256],
-        "tokenizer": "gpt2"
-    },
-    "llama2-13b-smoothquant": {
-        "max_memory_per_gpu": [21.0],
-        "batch_size": [1, 4],
-        "seq_length": [256],
-        "tokenizer": "TheBloke/Llama-2-13B-fp16"
-    },
-}
-
 no_code_rolling_batch_spec = {
     "llama-7b": {
         "max_memory_per_gpu": [25.0],
@@ -1231,46 +1081,6 @@ def test_performance():
     log_metrics(response_times)
 
 
-def test_sd_handler(model, model_spec):
-    from PIL import Image
-
-    if model not in model_spec:
-        raise ValueError(
-            f"{model} is not one of the supporting models {list(sd_model_spec.keys())}"
-        )
-    spec = sd_model_spec[model]
-    if "worker" in spec:
-        check_worker_number(spec["worker"])
-    for size in spec["size"]:
-        for step in spec["num_inference_steps"]:
-            if "depth" in spec:
-                req = {"prompt": "two tigers"}
-                params = {
-                    "negative_prompt": "bad, deformed, ugly, bad anotomy",
-                    "strength": 0.7
-                }
-                url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-                req["parameters"] = params
-                logging.info(f"req: {req}")
-                res = send_image_json(url, req)
-            else:
-                req = {"prompt": "A bird and cat flying through space"}
-                params = {
-                    "height": size,
-                    "width": size,
-                    "num_inference_steps": step
-                }
-                req["parameters"] = params
-                logging.info(f"req: {req}")
-                res = send_json(req)
-            try:
-                Image.open(BytesIO(res.content)).convert("RGB")
-            except Exception as e:
-                raise IOError("failed to deserialize image from response", e)
-    if "max_memory_per_gpu" in spec:
-        validate_memory_usage(spec["max_memory_per_gpu"])
-
-
 def test_neuron_sd_handler(model, model_spec):
     from PIL import Image
     if model not in model_spec:
@@ -1316,48 +1126,11 @@ def test_transformers_neuronx_handler(model, model_spec):
             assert len(result) == batch_size
 
 
-def test_ds_smoothquant(model, model_spec):
-    if model not in model_spec:
-        raise ValueError(
-            f"{args.model} is not one of the supporting models {list(model_spec.keys())}"
-        )
-    spec = model_spec[args.model]
-    for i, batch_size in enumerate(spec["batch_size"]):
-        for seq_length in spec["seq_length"]:
-            req = {
-                "inputs": batch_generation(batch_size),
-                "batch_size": batch_size,
-                "text_length": seq_length
-            }
-            logging.info(f"req: {req}")
-            res = send_json(req)
-            res = res.json()
-            logging.info(f"res: {res}")
-            assert len(res) == batch_size
-        if "max_memory_per_gpu" in spec:
-            validate_memory_usage(spec["max_memory_per_gpu"][i])
-
-
-def test_unmerged_lora_correctness():
-    res = send_json({})
-    logging.info(f"res: {res.json()}")
-
-
 if __name__ == "__main__":
-    if args.handler == "deepspeed_raw":
-        test_ds_raw_model(args.model, ds_raw_model_spec)
-    elif args.handler == "huggingface":
+    if args.handler == "huggingface":
         test_handler(args.model, hf_model_spec)
-    elif args.handler == "deepspeed":
-        test_handler(args.model, ds_model_spec)
-    elif args.handler == "stable-diffusion":
-        test_sd_handler(args.model, sd_model_spec)
     elif args.handler == "neuron-stable-diffusion":
         test_neuron_sd_handler(args.model, neuron_sd_model_spec)
-    elif args.handler == "deepspeed_aot":
-        test_ds_raw_model(args.model, ds_aot_model_spec)
-    elif args.handler == "deepspeed_handler_aot":
-        test_handler(args.model, ds_aot_model_spec)
     elif args.handler == "transformers_neuronx":
         test_transformers_neuronx_handler(args.model,
                                           transformers_neuronx_model_spec)
@@ -1380,19 +1153,12 @@ def test_unmerged_lora_correctness():
         test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec)
     elif args.handler == "performance":
         test_performance()
-    elif args.handler == "unmerged_lora":
-        test_unmerged_lora_correctness()
-    elif args.handler == "deepspeed_smoothquant":
-        test_ds_smoothquant(args.model, ds_smoothquant_model_spec)
     elif args.handler == "lmi_dist_aiccl":
         test_handler_rolling_batch(args.model, lmi_dist_aiccl_model_spec)
     elif args.handler == "trtllm":
         test_handler_rolling_batch(args.model, trtllm_model_spec)
     elif args.handler == "trtllm-python":
         test_handler(args.model, trtllm_model_spec)
-    elif args.handler == "deepspeed_rolling_batch":
-        test_handler_rolling_batch(args.model,
-                                   deepspeed_rolling_batch_model_spec)
     elif args.handler == "no_code":
         test_handler_rolling_batch(args.model, no_code_rolling_batch_spec)
diff --git a/tests/integration/llm/deepspeed-model.py b/tests/integration/llm/deepspeed-model.py
deleted file mode 100644
index de83f232b..000000000
--- a/tests/integration/llm/deepspeed-model.py
+++ /dev/null
@@ -1,134 +0,0 @@
-from djl_python import Input, Output
-import deepspeed
-import torch
-import logging
-import math
-import os
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
-
-torch.manual_seed(1234)
-
-
-def get_torch_dtype_from_str(dtype: str):
-    if dtype == "float32":
-        return torch.float32
-    if dtype == "float16":
-        return torch.float16
-    if dtype == "bfloat16":
-        return torch.bfloat16
-    if dtype == "int8":
-        return torch.int8
-    raise ValueError(f"Invalid data type: {dtype}")
-
-
-def load_model(properties):
-    tensor_parallel = properties["tensor_parallel_degree"]
-    model_location = properties['model_dir']
-    if "model_id" in properties:
-        model_location = properties['model_id']
-    logging.info(f"Loading model in {model_location}")
-    checkpoint = None
-    if "checkpoint" in properties:
-        checkpoint = os.path.join(model_location, properties['checkpoint'])
-
-    data_type = get_torch_dtype_from_str(properties.get("dtype", "float16"))
-
-    dtype = torch.float16 if data_type == torch.int8 else data_type
-    kwargs = {"torch_dtype": dtype} if dtype else {}
-    ds_kwargs = dict()
-    if checkpoint:
-        config_file = os.path.join(model_location, "config.json")
-        config = AutoConfig.from_pretrained(config_file)
-        with deepspeed.OnDevice(dtype=dtype, device="meta"):
-            model = AutoModelForCausalLM.from_config(config, **kwargs)
-
-        ds_kwargs["checkpoint"] = checkpoint
-        ds_kwargs["base_dir"] = model_location
-    else:
-        model = AutoModelForCausalLM.from_pretrained(model_location,
-                                                     low_cpu_mem_usage=True,
-                                                     **kwargs)
-
-    tokenizer = AutoTokenizer.from_pretrained(model_location)
-    logging.info(f"Starting DeepSpeed init with TP={tensor_parallel}")
-    model = deepspeed.init_inference(
-        model,
-        tensor_parallel={"tp_size": tensor_parallel},
-        dtype=model.dtype,
-        replace_method='auto',
-        replace_with_kernel_inject=True,
-        max_tokens=1024,
-        save_mp_checkpoint_path=properties.get("save_mp_checkpoint_path"),
-        **ds_kwargs)
-    return model.module, tokenizer
-
-
-def batch_generation(batch_size):
-    input_sentences = [
-        "DeepSpeed is a machine learning framework",
-        "He is working on",
-        "He has a",
-        "He got all",
-        "Everyone is happy and I can",
-        "The new movie that got Oscar this year",
-        "In the far far distance from our galaxy,",
-        "Peace is the only way",
-    ]
-    if batch_size > len(input_sentences):
-        # dynamically extend to support larger bs by repetition
-        input_sentences *= math.ceil(batch_size / len(input_sentences))
-    return input_sentences[:batch_size]
-
-
-model = None
-tokenizer = None
-generator = None
-
-
-def separate_inference(model, tokenizer, batch_size, length):
-    generate_kwargs = dict(max_new_tokens=length, do_sample=True)
-    input_tokens = tokenizer.batch_encode_plus(batch_generation(batch_size),
-                                               return_tensors="pt",
-                                               padding=True)
-    for t in input_tokens:
-        if torch.is_tensor(input_tokens[t]):
-            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
-    outputs = model.generate(**input_tokens, **generate_kwargs)
-    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-
-def pipeline_inference(model, tokenizer, batch_size, length):
-    global generator
-    if not generator:
-        local_rank = int(os.getenv('LOCAL_RANK', '0'))
-        generator = pipeline(task='text-generation',
-                             model=model,
-                             tokenizer=tokenizer,
-                             device=local_rank)
-    outputs = generator(batch_generation(batch_size), max_length=length)
-    return [item[0]['generated_text'] for item in outputs]
-
-
-def partition(inputs: Input):
-    load_model(inputs.get_properties())
-
-
-def handle(inputs: Input):
-    global model, tokenizer
-    if not model:
-        model, tokenizer = load_model(inputs.get_properties())
-
-    if inputs.is_empty():
-        # Model server makes an empty call to warmup the model on startup
-        return None
-    data = inputs.get_as_json()
-    batch_size = data["batch_size"]
-    tokens_to_gen = data["text_length"]
-    if data["use_pipeline"]:
-        outputs = pipeline_inference(model, tokenizer, batch_size,
-                                     tokens_to_gen)
-    else:
-        outputs = separate_inference(model, tokenizer, batch_size,
-                                     tokens_to_gen)
-    result = {"outputs": outputs}
-    return Output().add_as_json(result)
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index a4babd724..959ab0ede 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -20,64 +20,6 @@
                     help='The model tensor parallel degree')
 args = parser.parse_args()
 
-ds_aot_list = {
-    "gpt-neo-2.7b": {
-        "option.model_id":
-        "EleutherAI/gpt-neo-2.7B",
-        "option.tensor_parallel_degree":
-        2,
-        "option.task":
-        "text-generation",
-        "option.dtype":
-        "float16",
-        "option.save_mp_checkpoint_path":
-        "/opt/ml/input/data/training/partition-test"
-    },
-}
-
-ds_aot_handler_list = {
-    "opt-6.7b": {
-        "option.model_id":
-        "s3://djl-llm/opt-6b7/",
-        "option.tensor_parallel_degree":
-        4,
-        "option.task":
-        "text-generation",
-        "option.dtype":
-        "fp16",
-        "option.save_mp_checkpoint_path":
-        "/opt/ml/input/data/training/partition-test"
-    },
-    "bloom-7b1": {
-        "option.model_id":
-        "s3://djl-llm/bloom-7b1/",
-        "option.tensor_parallel_degree":
-        4,
-        "option.task":
-        "text-generation",
-        "option.dtype":
-        "fp16",
-        "option.save_mp_checkpoint_path":
-        "s3://djl-llm/bloom-7b1-tp4/ds-aot-handler/"
-    }
-}
-
-ds_model_list = {
-    "gpt-j-6b": {
-        "option.model_id": "s3://djl-llm/gpt-j-6b/",
-        "option.tensor_parallel_degree": 4
-    },
-    "bloom-7b1": {
-        "option.model_id": "s3://djl-llm/bloom-7b1/",
-        "option.tensor_parallel_degree": 4,
-        "option.dtype": "float16"
-    },
-    "opt-30b": {
-        "option.model_id": "s3://djl-llm/opt-30b/",
-        "option.tensor_parallel_degree": 4
-    }
-}
-
 hf_handler_list = {
     "gpt-neo-2.7b": {
         "option.model_id": "EleutherAI/gpt-neo-2.7B",
@@ -137,108 +79,7 @@
     }
 }
 
-ds_handler_list = {
-    "gpt-j-6b": {
-        "option.model_id": "s3://djl-llm/gpt-j-6b/",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 2,
-        "option.dtype": "bf16",
-        "option.enable_streaming": False
-    },
-    "bloom-7b1": {
-        "option.model_id": "s3://djl-llm/bloom-7b1/",
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "option.enable_streaming": False
-    },
-    "open-llama-7b": {
-        "option.model_id": "s3://djl-llm/open-llama-7b/",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 4,
-        "option.dtype": "fp16",
-        "option.enable_streaming": False
-    },
-    "opt-13b": {
-        "option.model_id": "s3://djl-llm/opt-13b/",
-        "option.tensor_parallel_degree": 2,
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "option.enable_streaming": False
-    },
-    "gpt-neo-1.3b": {
-        "option.model_id": "EleutherAI/gpt-neo-1.3B",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 2,
-        "option.dtype": "fp16",
-        "option.enable_streaming": True
-    },
-    "gpt4all-lora": {
-        "option.model_id": "nomic-ai/gpt4all-lora",
-        "option.tensor_parallel_degree": 4,
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "option.enable_streaming": False
-    }
-}
-
-sd_handler_list = {
-    "stable-diffusion-v1-5": {
-        "option.model_id": "s3://djl-llm/stable-diffusion-v1-5/",
-        "option.tensor_parallel_degree": 4,
-        "option.dtype": "fp16"
-    },
-    "stable-diffusion-2-1-base": {
-        "option.model_id": "s3://djl-llm/stable-diffusion-2-1-base/",
-        "option.tensor_parallel_degree": 2,
-        "option.dtype": "fp16"
-    },
-    "stable-diffusion-2-depth": {
-        "option.model_id": "s3://djl-llm/stable-diffusion-2-depth/",
-        "option.tensor_parallel_degree": 1,
-        "option.dtype": "fp16",
-        "gpu.maxWorkers": 1
-    }
-}
-
 performance_test_list = {
-    "opt-30b-fp16-deepspeed": {
-        "option.task": "text-generation",
-        "option.model_id": "s3://djl-llm/opt-30b/",
-        "option.parallel_loading": "true",
-        "engine": "DeepSpeed",
-        "option.dtype": "fp16",
-        "option.rolling_batch": "deepspeed",
-    },
-    "opt-30b-bf16-deepspeed": {
-        "option.task": "text-generation",
-        "option.model_id": "s3://djl-llm/opt-30b/",
-        "option.parallel_loading": "true",
-        "engine": "DeepSpeed",
-        "option.dtype": "bf16",
-        "option.rolling_batch": "deepspeed",
-    },
-    "opt-30b-lmi-dist": {
-        "option.task": "text-generation",
-        "option.model_id": "s3://djl-llm/opt-30b/",
-        "engine": "MPI",
-        "option.rolling_batch": "deepspeed",
-    },
-    "open-llama-13b-fp16-deepspeed": {
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/open-llama-13b/",
-        "option.rolling_batch": "deepspeed",
-        "option.max_rolling_batch_size": 4,
-    },
-    "open-llama-13b-bf16-deepspeed": {
-        "option.task": "text-generation",
-        "option.dtype": "bf16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/open-llama-13b/",
-        "option.rolling_batch": "deepspeed",
-        "option.max_rolling_batch_size": 4,
-    },
     "open-llama-13b-fp16-lmi-dist": {
         "option.task": "text-generation",
         "option.dtype": "fp16",
@@ -246,90 +87,17 @@
         "option.model_id": "s3://djl-llm/open-llama-13b/",
         "option.rolling_batch": "lmi-dist",
     },
-    "open-llama-13b-smoothquant": {
-        "option.task": "text-generation",
-        "option.model_id": "s3://djl-llm/open-llama-13b/",
-        "option.dtype": "fp16",
-        "engine": "DeepSpeed",
-        "option.quantize": "smoothquant",
-        "option.rolling_batch": "deepspeed",
-        "option.max_rolling_batch_size": 4,
-    },
-    "gpt-j-6b-fp16-deepspeed": {
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/gpt-j-6b/",
-        "option.rolling_batch": "deepspeed",
-        "option.max_rolling_batch_size": 4,
-    },
-    "gpt-j-6b-bf16-deepspeed": {
-        "option.task": "text-generation",
-        "option.dtype": "bf16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/gpt-j-6b/",
-        "option.rolling_batch": "deepspeed",
-        "option.max_rolling_batch_size": 4,
-    },
-    "gpt-j-6b-smoothquant": {
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/gpt-j-6b/",
-        "option.quantize": "smoothquant",
-        "option.rolling_batch": "deepspeed",
-        "option.max_rolling_batch_size": 4,
-    },
-    "bloom-7b1-fp16-deepspeed": {
-        "engine": "DeepSpeed",
-        "option.task": "text-generation",
-        "option.model_id": "s3://djl-llm/bloom-7b1/",
-        "option.dtype": "fp16",
-        "option.rolling_batch": "deepspeed",
-    },
-    "bloom-7b1-bf16-deepspeed": {
-        "engine": "DeepSpeed",
-        "option.task": "text-generation",
-        "option.model_id": "s3://djl-llm/bloom-7b1/",
-        "option.dtype": "bf16",
-        "option.rolling_batch": "deepspeed",
-    },
     "bloom-7b1-fp16-lmi-dist": {
         "engine": "MPI",
         "option.task": "text-generation",
         "option.rolling_batch": "lmi-dist",
     },
-    "gpt-neox-20b-fp16-deepspeed": {
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/gpt-neox-20b/",
-        "option.parallel_loading": "true",
-        "option.rolling_batch": "deepspeed",
-    },
     "gpt-neox-20b-fp16-lmi-dist": {
         "option.task": "text-generation",
         "option.dtype": "fp16",
         "engine": "MPI",
         "option.model_id": "s3://djl-llm/gpt-neox-20b/",
         "option.rolling_batch": "lmi-dist",
-    },
-    "gpt-neox-20b-bf16-deepspeed": {
-        "option.task": "text-generation",
-        "option.dtype": "bf16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/gpt-neox-20b/",
-        "option.parallel_loading": "true",
-        "option.rolling_batch": "deepspeed",
-    },
-    "gpt-neox-20b-smoothquant": {
-        "option.task": "text-generation",
-        "option.dtype": "fp16",
-        "engine": "DeepSpeed",
-        "option.model_id": "s3://djl-llm/gpt-neox-20b/",
-        "option.quantize": "smoothquant",
-        "option.smoothquant_alpha": 0.65,
-        "option.rolling_batch": "deepspeed",
     }
 }
 
@@ -753,38 +521,6 @@
     }
 }
 
-unmerged_lora_correctness_list = {
-    "llama-7b-unmerged-lora": {
-        "option.tensor_parallel_degree": 1,
-        "gpu.maxWorkers": 1,
-        "load_on_devices": 0,
-    }
-}
-
-ds_smoothquant_model_list = {
-    "gpt-j-6b": {
-        "option.model_id": "s3://djl-llm/gpt-j-6b/",
-        "option.tensor_parallel_degree": 4,
-        "option.quantize": "smoothquant",
-    },
-    "gpt-neox-20b": {
-        "option.model_id": "s3://djl-llm/gpt-neox-20b",
-        "option.tensor_parallel_degree": 4,
-        "option.quantize": "smoothquant",
-        "option.smoothquant_alpha": 0.65,
-    },
-    "llama2-13b-dynamic-int8": {
-        "option.model_id": "TheBloke/Llama-2-13B-fp16",
-        "option.tensor_parallel_degree": 4,
-        "option.quantize": "dynamic_int8",
-    },
-    "llama2-13b-smoothquant": {
-        "option.model_id": "TheBloke/Llama-2-13B-fp16",
-        "option.tensor_parallel_degree": 4,
-        "option.quantize": "smoothquant",
-    },
-}
-
 lmi_dist_aiccl_model_list = {
     "llama-2-70b-aiccl": {
         "option.model_id": "s3://djl-llm/llama-2-70b-hf/",
@@ -907,34 +643,6 @@
     }
 }
 
-deepspeed_rolling_batch_model_list = {
-    "gpt-neox-20b": {
-        "option.model_id": "s3://djl-llm/gpt-neox-20b",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 4
-    },
-    "open-llama-7b": {
-        "option.model_id": "s3://djl-llm/open-llama-7b",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 4
-    },
-    "gpt2": {
-        "option.model_id": "gpt2",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 1,
-        "option.max_rolling_batch_size": 2
-    },
-    "llama2-13b-smoothquant": {
-        "option.model_id": "TheBloke/Llama-2-13B-fp16",
-        "option.task": "text-generation",
-        "option.tensor_parallel_degree": 4,
-        "option.max_rolling_batch_size": 4,
-        "option.quantize": "smoothquant",
-    },
-}
-
 
 def write_model_artifacts(properties,
                           requirements=None,
@@ -988,36 +696,6 @@ def build_hf_handler_model(model):
                           adapter_names=adapter_names)
 
 
-def build_ds_handler_model(model):
-    if model not in ds_handler_list:
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(ds_handler_list.keys())}"
-        )
-    options = ds_handler_list[model]
-    options["engine"] = "DeepSpeed"
-    # options["option.entryPoint"] = "djl_python.deepspeed"
-    write_model_artifacts(options)
-
-
-def build_ds_raw_model(model):
-    options = ds_model_list[model]
-    options["engine"] = "DeepSpeed"
-    write_model_artifacts(options)
-    shutil.copyfile("llm/deepspeed-model.py", "models/test/model.py")
-
-
-def build_ds_aot_model(model):
-    if model not in ds_aot_list:
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(ds_aot_list.keys())}"
-        )
-
-    options = ds_aot_list[model]
-    options["engine"] = "DeepSpeed"
-    write_model_artifacts(options)
-    shutil.copyfile("llm/deepspeed-model.py", "models/test/model.py")
-
-
 def build_performance_model(model):
     if model in performance_test_list.keys():
         options = performance_test_list[model]
@@ -1038,28 +716,6 @@ def build_performance_model(model):
     write_model_artifacts(options)
 
 
-def build_ds_aot_handler_model(model):
-    if model not in ds_aot_handler_list:
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(ds_aot_handler_list.keys())}"
-        )
-
-    options = ds_aot_handler_list[model]
-    options["engine"] = "DeepSpeed"
-    write_model_artifacts(options)
-
-
-def build_sd_handler_model(model):
-    if model not in sd_handler_list:
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(ds_handler_list.keys())}"
-        )
-    options = sd_handler_list[model]
-    options["engine"] = "DeepSpeed"
-    options["option.entryPoint"] = "djl_python.stable-diffusion"
-    write_model_artifacts(options)
-
-
 def build_transformers_neuronx_handler_model(model):
     if model not in transformers_neuronx_handler_list.keys():
         raise ValueError(
@@ -1128,30 +784,6 @@ def build_vllm_model(model):
                           adapter_names=adapter_names)
 
 
-def build_unmerged_lora_correctness_model(model):
-    if model not in unmerged_lora_correctness_list:
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(unmerged_lora_correctness_list.keys())}"
-        )
-    options = unmerged_lora_correctness_list[model]
-    options["engine"] = "Python"
-    write_model_artifacts(options)
-    shutil.copyfile("llm/unmerged_lora.py", "models/test/model.py")
-
-
-def build_ds_smoothquant_model(model):
-    if model not in ds_smoothquant_model_list.keys():
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(ds_smoothquant_model_list.keys())}"
-        )
-    options = ds_smoothquant_model_list[model]
-    options["engine"] = "DeepSpeed"
-    options["entryPoint"] = "djl_python.deepspeed"
-    options["dtype"] = "fp16"
-    options["task"] = "text-generation"
-    write_model_artifacts(options)
-
-
 def build_lmi_dist_aiccl_model(model):
     if model not in lmi_dist_aiccl_model_list.keys():
         raise ValueError(
@@ -1178,27 +810,8 @@ def build_trtllm_handler_model(model):
     write_model_artifacts(options)
 
 
-def build_deepspeed_rolling_batch_model(model):
-    if model not in deepspeed_rolling_batch_model_list.keys():
-        raise ValueError(
-            f"{model} is not one of the supporting handler {list(deepspeed_rolling_batch_model_list.keys())}"
-        )
-    options = deepspeed_rolling_batch_model_list[model]
-    options["engine"] = "DeepSpeed"
-    options["option.rolling_batch"] = "deepspeed"
-    options["option.output_formatter"] = "jsonlines"
-    write_model_artifacts(options)
-
-
 supported_handler = {
-    'deepspeed': build_ds_handler_model,
-    "deepspeed_raw": build_ds_raw_model,
-    'deepspeed_aot': build_ds_aot_model,
-    'deepspeed_handler_aot': build_ds_aot_handler_model,
-    'deepspeed_smoothquant': build_ds_smoothquant_model,
-    'deepspeed_rolling_batch': build_deepspeed_rolling_batch_model,
     'huggingface': build_hf_handler_model,
-    'stable-diffusion': build_sd_handler_model,
     'transformers_neuronx': build_transformers_neuronx_handler_model,
     'transformers_neuronx_aot': build_transformers_neuronx_aot_handler_model,
     'performance': build_performance_model,
@@ -1206,7 +819,6 @@ def build_deepspeed_rolling_batch_model(model):
     'lmi_dist': build_lmi_dist_model,
     'lmi_dist_aiccl': build_lmi_dist_aiccl_model,
     'vllm': build_vllm_model,
-    'unmerged_lora': build_unmerged_lora_correctness_model,
     'trtllm': build_trtllm_handler_model,
 }
diff --git a/tests/integration/llm/unmerged_lora.py b/tests/integration/llm/unmerged_lora.py
deleted file mode 100644
index 0058084d6..000000000
--- a/tests/integration/llm/unmerged_lora.py
+++ /dev/null
@@ -1,129 +0,0 @@
-from djl_python import Input, Output
-from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
-from peft import PeftModel
-import torch
-import json
-import logging
-
-BASE_MODEL_ID = "huggyllama/llama-7b"
-LORA_ADAPTER_1_ID = "tloen/alpaca-lora-7b"
-LORA_ADAPTER_2_ID = "22h/cabrita-lora-v0-1"
-LORA_ADAPTER_1_NAME = "english-alpaca"
-LORA_ADAPTER_2_NAME = "protugese-alpaca"
-
-model = None
-tokenizer = None
-
-
-def construct_error_output(output, err_msg):
-    error = {"code": 500, "error": err_msg}
-    error = json.dumps(error)
-    output.add(error, key="data")
-    return output
-
-
-def generate_prompt(instruction):
-    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the
-    request.### Instruction: {instruction} ### Response:"""
-
-
-def load_model():
-    global model, tokenizer
-    # load base model
-    model = LlamaForCausalLM.from_pretrained(
-        BASE_MODEL_ID, torch_dtype=torch.float16).to("cuda:0")
-    tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL_ID)
-    if not tokenizer.pad_token:
-        tokenizer.pad_token = '[PAD]'
-
-    # load lora adapter 1
-    model = PeftModel.from_pretrained(model,
-                                      LORA_ADAPTER_1_ID,
-                                      adapter_name=LORA_ADAPTER_1_NAME)
-    # load lora adapter 2
-    model.load_adapter(LORA_ADAPTER_2_ID, adapter_name=LORA_ADAPTER_2_NAME)
-
-
-def inference():
-    global model, tokenizer
-    output = Output()
-    if len(model.peft_config.keys()) != 2:
-        return construct_error_output(
-            output, "Incorrect number of adapters registered")
-
-    input1 = {
-        "inputs": "Tell me about Alpacas",
-        "adapter_name": LORA_ADAPTER_1_NAME
-    }
-    input2 = {
-        "inputs":
-        "Invente uma desculpa criativa pra dizer que não preciso ir à festa.",
-        "adapter_name": LORA_ADAPTER_2_NAME
-    }
-
-    generation_config = GenerationConfig(num_beams=1, do_sample=False)
-
-    prompts = [
-        generate_prompt(input1["inputs"]),
-        generate_prompt(input2["inputs"]),
-    ]
-
-    adapters = [
-        input1["adapter_name"],
-        input2["adapter_name"],
-    ]
-
-    inputs = tokenizer(prompts, return_tensors="pt", padding=True)
-    input_ids = inputs["input_ids"].to(torch.cuda.current_device())
-    attention_mask = inputs["attention_mask"].to(torch.cuda.current_device())
-    outputs = model.generate(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        adapters=adapters,
-        generation_config=generation_config,
-        return_dict_in_generate=False,
-        max_new_tokens=64,
-    )
-    outputs_unmerged_lora = tokenizer.batch_decode(outputs,
-                                                   skip_special_tokens=True)
-    if len(outputs_unmerged_lora) != 2:
-        return construct_error_output(output, "Incorrect number of outputs")
-
-    logging.info(f"outputs from unmerged lora: {outputs_unmerged_lora}")
-
-    model.delete_adapter(LORA_ADAPTER_2_NAME)
-    if len(model.peft_config.keys()) != 1:
-        return construct_error_output(
-            output, "Incorrect number of adapters registered after delete op")
-
-    # merge lora adapter 1 into base model
-    model.set_adapter(LORA_ADAPTER_1_NAME)
-    model.merge_and_unload()
-    outputs_lora_1 = model.generate(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        generation_config=generation_config,
-        return_dict_in_generate=False,
-        max_new_tokens=64,
-    )
-
-    outputs_merged_lora = tokenizer.batch_decode(outputs_lora_1,
-                                                 skip_special_tokens=True)
-
-    prediction = [{
-        'unmerged_lora_result': outputs_unmerged_lora[0]
-    }, {
-        'merged_lora_result': outputs_merged_lora[0]
-    }]
-    output.add_as_json(prediction, key="data")
-    return output
-
-
-def handle(input: Input):
-    if not model:
-        load_model()
-
-    if input.is_empty():
-        return None
-
-    return inference()
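-- 

Note on the surviving test flow: every CI step above follows the same prepare/serve/verify pattern, and `tests/integration/llm/prepare.py` keeps a name-to-builder dispatch table after this batch of removals. Below is a minimal, illustrative Python sketch of that pattern, not the repo's actual file: the `supported_handler` name and the surviving 'huggingface'/'trtllm' keys come from the hunks above, while the builder bodies and the argparse wrapper here are assumptions for illustration (the real table also keeps the neuronx, lmi_dist, vllm, performance, and no_code entries shown in the diff).

#!/usr/bin/env python3
# Illustrative sketch only -- not a file in this patch.
import argparse


def build_hf_handler_model(model):
    # Stand-in for the real builder, which writes the model's
    # serving.properties under models/test/ before the container launches.
    print(f"preparing huggingface test artifacts for {model}")


def build_trtllm_handler_model(model):
    print(f"preparing trtllm test artifacts for {model}")


# Trimmed dispatch table: the DeepSpeed, stable-diffusion, and
# unmerged-LoRA builders removed by this patch no longer appear here,
# so selecting them from the CLI now fails fast at argument parsing.
supported_handler = {
    'huggingface': build_hf_handler_model,
    'trtllm': build_trtllm_handler_model,
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='prepare test model artifacts')
    parser.add_argument('handler', choices=sorted(supported_handler))
    parser.add_argument('model')
    args = parser.parse_args()
    supported_handler[args.handler](args.model)

A workflow step drives this the same way the remaining jobs do, e.g. `python3 llm/prepare.py huggingface gpt-neo-2.7b`, then `./launch_container.sh ... serve`, then `python3 llm/client.py huggingface gpt-neo-2.7b` to verify the served model.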