Add vllm and lmi-dist lora awq tests (#1937)
rohithkrn authored May 24, 2024
1 parent 65f6fb7 commit 030a745
Showing 4 changed files with 361 additions and 30 deletions.
179 changes: 154 additions & 25 deletions .github/workflows/llm_integration.yml
@@ -8,7 +8,7 @@ on:
required: false
default: ''
run_test:
description: 'Run only the tests you need [ hf, trtllm, scheduler, lmi-dist, vllm ]'
description: 'Run only the tests you need [ hf, trtllm, scheduler, lmi-dist, vllm, vllm-lora, lmi-dist-lora ]'
required: false
default: ''
schedule:
@@ -566,24 +566,6 @@ jobs:
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py lmi_dist mpt-7b
docker rm -f $(docker ps -aq)
- name: Test lmi-dist unmerged lora - llama7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora
docker rm -f $(docker ps -aq)
- name: Test lmi-dist unmerged lora overflow - llama7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora-overflow
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora-overflow
docker rm -f $(docker ps -aq)
- name: On fail step
if: ${{ failure() }}
working-directory: tests/integration
@@ -802,6 +784,53 @@ jobs:
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py vllm gemma-7b
docker rm -f $(docker ps -aq)
- name: Test llama2-7b-chat
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py vllm llama2-7b-chat
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py vllm_chat llama2-7b-chat
docker rm -f $(docker ps -aq)
- name: On fail step
if: ${{ failure() }}
working-directory: tests/integration
run: |
docker rm -f $(docker ps -aq) || true
cat logs/serving.log
- name: Upload test logs
uses: actions/upload-artifact@v3
with:
name: vllm-logs
path: tests/integration/logs/


vllm-lora-test:
if: contains(fromJson('["", "vllm-lora"]'), github.event.inputs.run_test)
runs-on: [ self-hosted, g5 ]
timeout-minutes: 60
needs: create-runners
steps:
- uses: actions/checkout@v4
- name: Clean env
run: |
yes | docker system prune -a --volumes
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
echo "wait dpkg lock..."
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install requests numpy huggingface_hub
- name: Build container name
run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }}
- name: Download docker
working-directory: tests/integration
run: |
docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
- name: Test vllm unmerged lora - llama7b
working-directory: tests/integration
run: |
@@ -820,14 +849,32 @@ jobs:
serve
python3 llm/client.py vllm_adapters llama-7b-unmerged-lora-overflow
docker rm -f $(docker ps -aq)
- name: Test llama2-7b-chat
- name: Test vllm lora awq - llama2-13b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py vllm llama2-7b-chat
python3 llm/prepare.py vllm llama2-13b-awq-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve -m test=file:/opt/ml/model/test/
python3 llm/client.py vllm_chat llama2-7b-chat
serve
python3 llm/client.py vllm_adapters llama2-13b-awq-unmerged-lora
docker rm -f $(docker ps -aq)
- name: Test vllm lora - mistral-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py vllm mistral-7b-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py vllm_adapters mistral-7b-unmerged-lora
docker rm -f $(docker ps -aq)
- name: Test vllm lora awq - mistral-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py vllm mistral-7b-awq-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py vllm_adapters mistral-7b-awq-unmerged-lora
docker rm -f $(docker ps -aq)
- name: On fail step
if: ${{ failure() }}
@@ -838,13 +885,95 @@ jobs:
- name: Upload test logs
uses: actions/upload-artifact@v3
with:
name: vllm-logs
name: vllm-lora-logs
path: tests/integration/logs/

lmi-dist-lora-test:
if: contains(fromJson('["", "lmi-dist-lora"]'), github.event.inputs.run_test)
runs-on: [ self-hosted, g5 ]
timeout-minutes: 60
needs: create-runners
steps:
- uses: actions/checkout@v4
- name: Clean env
run: |
yes | docker system prune -a --volumes
sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
echo "wait dpkg lock..."
while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install requests numpy huggingface_hub
- name: Build container name
run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }}
- name: Download docker
working-directory: tests/integration
run: |
docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
- name: Test lmi-dist unmerged lora - llama7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora
docker rm -f $(docker ps -aq)
- name: Test lmi-dist unmerged lora overflow - llama7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora-overflow
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora-overflow
docker rm -f $(docker ps -aq)
- name: Test lmi-dist lora awq - llama2-13b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist llama2-13b-awq-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters llama2-13b-awq-unmerged-lora
docker rm -f $(docker ps -aq)
- name: Test lmi-dist lora - mistral-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist mistral-7b-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters mistral-7b-unmerged-lora
docker rm -f $(docker ps -aq)
- name: Test lmi-dist lora awq - mistral-7b
working-directory: tests/integration
run: |
rm -rf models
python3 llm/prepare.py lmi_dist mistral-7b-awq-unmerged-lora
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
serve
python3 llm/client.py lmi_dist_adapters mistral-7b-awq-unmerged-lora
docker rm -f $(docker ps -aq)
- name: On fail step
if: ${{ failure() }}
working-directory: tests/integration
run: |
docker rm -f $(docker ps -aq) || true
cat logs/serving.log
- name: Upload test logs
uses: actions/upload-artifact@v3
with:
name: lmi-dist-lora-logs
path: tests/integration/logs/

stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test]
needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test, vllm-lora-test, lmi-dist-lora-test]
steps:
- name: Stop all instances
run: |
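With the new run_test values, the LoRA jobs can be dispatched individually. A minimal sketch of such a dispatch through the GitHub REST API (the repo path, ref, and token handling are assumptions; run_test and djl-version are the workflow_dispatch inputs defined above):

    import os
    import requests

    # Sketch: trigger only the vllm-lora job via workflow_dispatch.
    # Assumes a token with workflow scope in GITHUB_TOKEN.
    url = ("https://api.github.com/repos/deepjavalibrary/djl-serving"
           "/actions/workflows/llm_integration.yml/dispatches")
    headers = {
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    }
    payload = {"ref": "master", "inputs": {"run_test": "vllm-lora"}}
    resp = requests.post(url, headers=headers, json=payload)
    resp.raise_for_status()  # GitHub returns 204 No Content on success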
1 change: 1 addition & 0 deletions engines/python/setup/djl_python/huggingface.py
@@ -577,6 +577,7 @@ def unregister_adapter(inputs: Input):
"""
adapter_name = inputs.get_property("name")
logging.info(f"Unregistering adapter {adapter_name}")
#TODO: delete in vllm engine as well
del _service.adapter_registry[adapter_name]
if not is_rolling_batch_enabled(_service.hf_configs.rolling_batch):
_service.model.base_model.delete_adapter(adapter_name)
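The TODO added above flags that unregistration currently only drops the adapter from the handler-side registry; a LoRA already loaded into the vLLM engine is not evicted. A hedged sketch of one possible follow-up — vLLM engines expose remove_lora(lora_id), but the engine handle and id attribute used here are assumptions, not djl-serving API:

    # Hypothetical follow-up for the TODO; not part of this commit.
    def unregister_adapter(inputs: Input):
        adapter_name = inputs.get_property("name")
        logging.info(f"Unregistering adapter {adapter_name}")
        adapter = _service.adapter_registry.pop(adapter_name)
        engine = getattr(_service, "vllm_engine", None)  # assumed handle
        if engine is not None:
            # vLLM's LLMEngine keys loaded LoRAs by integer id
            engine.remove_lora(adapter.lora_id)  # assumed id attribute
        elif not is_rolling_batch_enabled(_service.hf_configs.rolling_batch):
            _service.model.base_model.delete_adapter(adapter_name)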
52 changes: 48 additions & 4 deletions tests/integration/llm/client.py
@@ -273,6 +273,27 @@ def get_model_name():
"adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
},
"llama2-13b-awq-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["french", "spanish"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
},
"mistral-7b-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "mistralai/Mistral-7B-v0.1"
},
"mistral-7b-awq-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "mistralai/Mistral-7B-v0.1"
},
"llama-7b-unmerged-lora-overflow": {
"max_memory_per_gpu": [15.0, 15.0],
"batch_size": [3],
@@ -340,6 +361,27 @@ def get_model_name():
"adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
},
"llama2-13b-awq-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["french", "spanish"],
"tokenizer": "TheBloke/Llama-2-13B-fp16"
},
"mistral-7b-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "mistralai/Mistral-7B-v0.1"
},
"mistral-7b-awq-unmerged-lora": {
"batch_size": [3],
"seq_length": [16, 32],
"worker": 1,
"adapters": ["spanish", "german"],
"tokenizer": "mistralai/Mistral-7B-v0.1"
},
"llama-7b-unmerged-lora-overflow": {
"max_memory_per_gpu": [15.0, 15.0],
"batch_size": [3],
@@ -975,10 +1017,12 @@ def test_handler_adapters(model, model_spec):
logging.info(f"call deleted adapter {res}")
if "error" not in res:
raise RuntimeError(f"Should not work with new adapters")
res = send_json(reqs[1]).content.decode("utf-8")
logging.info(f"call valid adapter after deletion {res}")
if "error" in res:
raise RuntimeError(f"Deleting adapter breaking inference")

if len(reqs) > 1:
res = send_json(reqs[1]).content.decode("utf-8")
logging.info(f"call valid adapter after deletion {res}")
if "error" in res:
raise RuntimeError(f"Deleting adapter breaking inference")


def test_handler_rolling_batch_chat(model, model_spec):
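For reference, each spec entry above fans out into one request per (seq_length, adapter) pair. A minimal sketch of that fan-out against a locally launched container — the port, route, prompt, and payload shape are assumptions based on the handler names, not copied from client.py:

    import itertools
    import requests

    spec = {
        "batch_size": [3],
        "seq_length": [16, 32],
        "adapters": ["spanish", "german"],
        "tokenizer": "mistralai/Mistral-7B-v0.1",
    }

    for seq_len, adapter in itertools.product(spec["seq_length"], spec["adapters"]):
        payload = {
            "inputs": "Tell me a short story.",        # placeholder prompt
            "parameters": {"max_new_tokens": seq_len},
            "adapters": [adapter],                     # route to this LoRA
        }
        res = requests.post("http://127.0.0.1:8080/invocations", json=payload)
        res.raise_for_status()
        assert "error" not in res.text, f"adapter {adapter} failed"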
(The diff for the fourth changed file did not load.)