diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index 076b68bdc..17e6b2253 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -8,7 +8,7 @@ on:
         required: false
         default: ''
       run_test:
-        description: 'Run only the tests you need [ hf, trtllm, scheduler, lmi-dist, vllm ]'
+        description: 'Run only the tests you need [ hf, trtllm, scheduler, lmi-dist, vllm, vllm-lora, lmi-dist-lora ]'
         required: false
         default: ''
   schedule:
@@ -566,24 +566,6 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py lmi_dist mpt-7b
           docker rm -f $(docker ps -aq)
-      - name: Test lmi-dist unmerged lora - llama7b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
-          serve
-          python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora
-          docker rm -f $(docker ps -aq)
-      - name: Test lmi-dist unmerged lora overflow - llama7b
-        working-directory: tests/integration
-        run: |
-          rm -rf models
-          python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora-overflow
-          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
-          serve
-          python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora-overflow
-          docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
         working-directory: tests/integration
@@ -802,6 +784,53 @@ jobs:
           serve -m test=file:/opt/ml/model/test/
           python3 llm/client.py vllm gemma-7b
           docker rm -f $(docker ps -aq)
+      - name: Test llama2-7b-chat
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py vllm llama2-7b-chat
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve -m test=file:/opt/ml/model/test/
+          python3 llm/client.py vllm_chat llama2-7b-chat
+          docker rm -f $(docker ps -aq)
+      - name: On fail step
+        if: ${{ failure() }}
+        working-directory: tests/integration
+        run: |
+          docker rm -f $(docker ps -aq) || true
+          cat logs/serving.log
+      - name: Upload test logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: vllm-logs
+          path: tests/integration/logs/
+
+
+  vllm-lora-test:
+    if: contains(fromJson('["", "vllm-lora"]'), github.event.inputs.run_test)
+    runs-on: [ self-hosted, g5 ]
+    timeout-minutes: 60
+    needs: create-runners
+    steps:
+      - uses: actions/checkout@v4
+      - name: Clean env
+        run: |
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy huggingface_hub
+      - name: Build container name
+        run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }}
+      - name: Download docker
+        working-directory: tests/integration
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
       - name: Test vllm unmerged lora - llama7b
         working-directory: tests/integration
         run: |
@@ -820,14 +849,32 @@ jobs:
           serve
           python3 llm/client.py vllm_adapters llama-7b-unmerged-lora-overflow
           docker rm -f $(docker ps -aq)
-      - name: Test llama2-7b-chat
+      - name: Test vllm lora awq - llama2-13b
         working-directory: tests/integration
         run: |
           rm -rf models
-          python3 llm/prepare.py vllm llama2-7b-chat
+          python3 llm/prepare.py vllm llama2-13b-awq-unmerged-lora
           ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
-          serve -m test=file:/opt/ml/model/test/
-          python3 llm/client.py vllm_chat llama2-7b-chat
+          serve
+          python3 llm/client.py vllm_adapters llama2-13b-awq-unmerged-lora
+          docker rm -f $(docker ps -aq)
+      - name: Test vllm lora - mistral-7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py vllm mistral-7b-unmerged-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py vllm_adapters mistral-7b-unmerged-lora
+          docker rm -f $(docker ps -aq)
+      - name: Test vllm lora awq - mistral-7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py vllm mistral-7b-awq-unmerged-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py vllm_adapters mistral-7b-awq-unmerged-lora
           docker rm -f $(docker ps -aq)
       - name: On fail step
         if: ${{ failure() }}
@@ -838,13 +885,95 @@ jobs:
       - name: Upload test logs
         uses: actions/upload-artifact@v3
         with:
-          name: vllm-logs
+          name: vllm-lora-logs
           path: tests/integration/logs/
+
+  lmi-dist-lora-test:
+    if: contains(fromJson('["", "lmi-dist-lora"]'), github.event.inputs.run_test)
+    runs-on: [ self-hosted, g5 ]
+    timeout-minutes: 60
+    needs: create-runners
+    steps:
+      - uses: actions/checkout@v4
+      - name: Clean env
+        run: |
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy huggingface_hub
+      - name: Build container name
+        run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }}
+      - name: Download docker
+        working-directory: tests/integration
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: Test lmi-dist unmerged lora - llama7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora
+          docker rm -f $(docker ps -aq)
+      - name: Test lmi-dist unmerged lora overflow - llama7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora-overflow
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora-overflow
+          docker rm -f $(docker ps -aq)
+      - name: Test lmi-dist lora awq - llama2-13b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist llama2-13b-awq-unmerged-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py lmi_dist_adapters llama2-13b-awq-unmerged-lora
+          docker rm -f $(docker ps -aq)
+      - name: Test lmi-dist lora - mistral-7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist mistral-7b-unmerged-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py lmi_dist_adapters mistral-7b-unmerged-lora
+          docker rm -f $(docker ps -aq)
+      - name: Test lmi-dist lora awq - mistral-7b
+        working-directory: tests/integration
+        run: |
+          rm -rf models
+          python3 llm/prepare.py lmi_dist mistral-7b-awq-unmerged-lora
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \
+          serve
+          python3 llm/client.py lmi_dist_adapters mistral-7b-awq-unmerged-lora
+          docker rm -f $(docker ps -aq)
+      - name: On fail step
+        if: ${{ failure() }}
+        working-directory: tests/integration
+        run: |
+          docker rm -f $(docker ps -aq) || true
+          cat logs/serving.log
+      - name: Upload test logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: lmi-dist-lora-logs
+          path: tests/integration/logs/
 
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test]
+    needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test, vllm-lora-test, lmi-dist-lora-test]
     steps:
       - name: Stop all instances
         run: |
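
The two new jobs are gated the same way as the existing suites: the job-level
`if:` admits a run when the `run_test` input is empty (as on scheduled runs)
or names that suite. The same check in Python terms:

    # Equivalent of: contains(fromJson('["", "vllm-lora"]'), github.event.inputs.run_test)
    def should_run(run_test: str, suite: str) -> bool:
        return run_test in ("", suite)

    assert should_run("", "vllm-lora")           # default: all suites run
    assert should_run("vllm-lora", "vllm-lora")  # targeted run
    assert not should_run("hf", "vllm-lora")     # other suites skipped
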
diff --git a/engines/python/setup/djl_python/huggingface.py b/engines/python/setup/djl_python/huggingface.py
index 5e2bd4d1c..9f2cab058 100644
--- a/engines/python/setup/djl_python/huggingface.py
+++ b/engines/python/setup/djl_python/huggingface.py
@@ -577,6 +577,7 @@ def unregister_adapter(inputs: Input):
     """
     adapter_name = inputs.get_property("name")
     logging.info(f"Unregistering adapter {adapter_name}")
+    # TODO: delete in vllm engine as well
    del _service.adapter_registry[adapter_name]
     if not is_rolling_batch_enabled(_service.hf_configs.rolling_batch):
         _service.model.base_model.delete_adapter(adapter_name)
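
Note on the TODO above: with rolling batch on the vLLM engine, removing the
adapter from the handler registry does not free the LoRA slot inside vLLM
itself. A minimal sketch of the missing cleanup, assuming the rolling-batch
object exposes the underlying engine (the accessor names here are
hypothetical; remove_lora is vLLM's engine-level API for releasing a LoRA):

    # Hypothetical cleanup; `rolling_batch.engine` is an assumed accessor.
    def unregister_adapter_in_vllm(service, adapter_name: str) -> None:
        lora_request = service.adapter_registry.pop(adapter_name)
        engine = service.rolling_batch.engine  # assumed handle to the vLLM engine
        engine.remove_lora(lora_request.lora_int_id)  # release the LoRA slot
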
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 463a5da19..50367446a 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -273,6 +273,27 @@ def get_model_name():
         "adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"],
         "tokenizer": "TheBloke/Llama-2-13B-fp16"
     },
+    "llama2-13b-awq-unmerged-lora": {
+        "batch_size": [3],
+        "seq_length": [16, 32],
+        "worker": 1,
+        "adapters": ["french", "spanish"],
+        "tokenizer": "TheBloke/Llama-2-13B-fp16"
+    },
+    "mistral-7b-unmerged-lora": {
+        "batch_size": [3],
+        "seq_length": [16, 32],
+        "worker": 1,
+        "adapters": ["spanish", "german"],
+        "tokenizer": "mistralai/Mistral-7B-v0.1"
+    },
+    "mistral-7b-awq-unmerged-lora": {
+        "batch_size": [3],
+        "seq_length": [16, 32],
+        "worker": 1,
+        "adapters": ["spanish", "german"],
+        "tokenizer": "mistralai/Mistral-7B-v0.1"
+    },
     "llama-7b-unmerged-lora-overflow": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],
@@ -340,6 +361,27 @@ def get_model_name():
         "adapters": ["english-alpaca", "portugese-alpaca", "english-alpaca"],
         "tokenizer": "TheBloke/Llama-2-13B-fp16"
     },
+    "llama2-13b-awq-unmerged-lora": {
+        "batch_size": [3],
+        "seq_length": [16, 32],
+        "worker": 1,
+        "adapters": ["french", "spanish"],
+        "tokenizer": "TheBloke/Llama-2-13B-fp16"
+    },
+    "mistral-7b-unmerged-lora": {
+        "batch_size": [3],
+        "seq_length": [16, 32],
+        "worker": 1,
+        "adapters": ["spanish", "german"],
+        "tokenizer": "mistralai/Mistral-7B-v0.1"
+    },
+    "mistral-7b-awq-unmerged-lora": {
+        "batch_size": [3],
+        "seq_length": [16, 32],
+        "worker": 1,
+        "adapters": ["spanish", "german"],
+        "tokenizer": "mistralai/Mistral-7B-v0.1"
+    },
     "llama-7b-unmerged-lora-overflow": {
         "max_memory_per_gpu": [15.0, 15.0],
         "batch_size": [3],
@@ -975,10 +1017,12 @@ def test_handler_adapters(model, model_spec):
         logging.info(f"call deleted adapter {res}")
         if "error" not in res:
             raise RuntimeError(f"Should not work with new adapters")
-        res = send_json(reqs[1]).content.decode("utf-8")
-        logging.info(f"call valid adapter after deletion {res}")
-        if "error" in res:
-            raise RuntimeError(f"Deleting adapter breaking inference")
+
+        if len(reqs) > 1:
+            res = send_json(reqs[1]).content.decode("utf-8")
+            logging.info(f"call valid adapter after deletion {res}")
+            if "error" in res:
+                raise RuntimeError(f"Deleting adapter breaking inference")
 
 
 def test_handler_rolling_batch_chat(model, model_spec):
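
The new len(reqs) > 1 guard exists because the request list is built one
entry per configured adapter, so a spec that registers a single adapter has
nothing left to probe after that adapter is unregistered. Roughly (the
payload shape below is illustrative, not necessarily the exact one client.py
builds):

    # One request per adapter in the spec; with one adapter, reqs[1] does not exist.
    reqs = [{"inputs": prompt, "adapters": name} for name in spec["adapters"]]
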
"s3://djl-llm/TheBloke-Llama-2-13b-Chat-AWQ/", + "option.tensor_parallel_degree": + "max", + "option.task": + "text-generation", + "option.dtype": + "fp16", + "option.adapters": + "adapters", + "option.enable_lora": + "true", + "option.max_lora_rank": + 64, + "adapter_ids": [ + "UnderstandLing/llama-2-13b-chat-fr", + "UnderstandLing/llama-2-13b-chat-es" + ], + "adapter_names": ["french", "spanish"], + "option.gpu_memory_utilization": + "0.8", + }, + "mistral-7b-unmerged-lora": { + "option.model_id": + "s3://djl-llm/mistral-7b-instruct-v02/", + "option.tensor_parallel_degree": + "max", + "option.task": + "text-generation", + "option.dtype": + "fp16", + "option.adapters": + "adapters", + "option.enable_lora": + "true", + "option.max_lora_rank": + 64, + "option.max_loras": + 2, + "adapter_ids": [ + "UnderstandLing/Mistral-7B-Instruct-v0.2-es", + "UnderstandLing/Mistral-7B-Instruct-v0.2-de" + ], + "adapter_names": ["spanish", "german"], + "option.gpu_memory_utilization": + "0.8", + }, + "mistral-7b-awq-unmerged-lora": { + "option.model_id": + "s3://djl-llm/mistral-7b-instruct-v02-awq/", + "option.tensor_parallel_degree": + "max", + "option.task": + "text-generation", + "option.dtype": + "fp16", + "option.adapters": + "adapters", + "option.enable_lora": + "true", + "option.max_lora_rank": + 64, + "option.max_loras": + 2, + "adapter_ids": [ + "UnderstandLing/Mistral-7B-Instruct-v0.2-es", + "UnderstandLing/Mistral-7B-Instruct-v0.2-de" + ], + "adapter_names": ["spanish", "german"], + "option.gpu_memory_utilization": + "0.8", + }, } vllm_model_list = { @@ -490,6 +574,79 @@ "adapter_names": ["english-alpaca", "portugese-alpaca"], "option.gpu_memory_utilization": "0.8", }, + "llama2-13b-awq-unmerged-lora": { + "option.model_id": + "s3://djl-llm/TheBloke-Llama-2-13b-Chat-AWQ/", + "option.tensor_parallel_degree": + "max", + "option.task": + "text-generation", + "option.dtype": + "fp16", + "option.adapters": + "adapters", + "option.enable_lora": + "true", + "option.max_lora_rank": + 64, + "adapter_ids": [ + "UnderstandLing/llama-2-7b-chat-es", + "UnderstandLing/llama-2-7b-chat-ru" + ], + "adapter_names": ["spanish", "russian"], + "option.gpu_memory_utilization": + "0.8", + }, + "mistral-7b-unmerged-lora": { + "option.model_id": + "s3://djl-llm/mistral-7b-instruct-v02/", + "option.tensor_parallel_degree": + "max", + "option.task": + "text-generation", + "option.dtype": + "fp16", + "option.adapters": + "adapters", + "option.enable_lora": + "true", + "option.max_lora_rank": + 64, + "option.max_loras": + 2, + "adapter_ids": [ + "UnderstandLing/Mistral-7B-Instruct-v0.2-es", + "UnderstandLing/Mistral-7B-Instruct-v0.2-de" + ], + "adapter_names": ["spanish", "german"], + "option.gpu_memory_utilization": + "0.8", + }, + "mistral-7b-awq-unmerged-lora": { + "option.model_id": + "s3://djl-llm/mistral-7b-instruct-v02-awq/", + "option.tensor_parallel_degree": + "max", + "option.task": + "text-generation", + "option.dtype": + "fp16", + "option.adapters": + "adapters", + "option.enable_lora": + "true", + "option.max_lora_rank": + 64, + "option.max_loras": + 2, + "adapter_ids": [ + "UnderstandLing/Mistral-7B-Instruct-v0.2-es", + "UnderstandLing/Mistral-7B-Instruct-v0.2-de" + ], + "adapter_names": ["spanish", "german"], + "option.gpu_memory_utilization": + "0.8", + }, "llama-7b-unmerged-lora-overflow": { "option.model_id": "s3://djl-llm/huggyllama-llama-7b", "option.tensor_parallel_degree": 1,