From 25a8e05f3d0713bf65a596cfdf6182e29e44f7a7 Mon Sep 17 00:00:00 2001 From: Zach Kimberg Date: Wed, 12 Jun 2024 11:54:37 -0700 Subject: [PATCH] [CI] Inferentia tests through pytest --- .github/workflows/llm_integration.yml | 169 ++++++++++++++++++++------ tests/integration/tests.py | 158 +++++++++++++++++++++++- tests/test_client.py | 6 +- 3 files changed, 290 insertions(+), 43 deletions(-) diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml index 02783f2393..705df07e80 100644 --- a/.github/workflows/llm_integration.yml +++ b/.github/workflows/llm_integration.yml @@ -15,56 +15,92 @@ jobs: create-runners: runs-on: [self-hosted, scheduler] steps: - - name: Create new G6 instance - id: create_gpu +# - name: Create new G6 instance +# id: create_gpu +# run: | +# cd /home/ubuntu/djl_benchmark_script/scripts +# token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ +# https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ +# --fail \ +# | jq '.token' | tr -d '"' ) +# ./start_instance.sh action_g6 $token djl-serving +# - name: Create new G6 instance +# id: create_gpu2 +# run: | +# cd /home/ubuntu/djl_benchmark_script/scripts +# token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ +# https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ +# --fail \ +# | jq '.token' | tr -d '"' ) +# ./start_instance.sh action_g6 $token djl-serving +# - name: Create new G6 instance +# id: create_gpu3 +# run: | +# cd /home/ubuntu/djl_benchmark_script/scripts +# token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ +# https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ +# --fail \ +# | jq '.token' | tr -d '"' ) +# ./start_instance.sh action_g6 $token djl-serving + - name: Create new Inf2.24xl instance + id: create_inf2 run: | cd /home/ubuntu/djl_benchmark_script/scripts token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ --fail \ | jq '.token' | tr -d '"' ) - ./start_instance.sh action_g6 $token djl-serving - - name: Create new G6 instance - id: create_gpu2 + ./start_instance.sh action_inf2 $token djl-serving + - name: Create new Inf2.24xl instance + id: create_inf2_2 run: | cd /home/ubuntu/djl_benchmark_script/scripts token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ --fail \ | jq '.token' | tr -d '"' ) - ./start_instance.sh action_g6 $token djl-serving - - name: Create new G6 instance - id: create_gpu3 - run: | - cd /home/ubuntu/djl_benchmark_script/scripts - token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \ - https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \ - --fail \ - | jq '.token' | tr -d '"' ) - ./start_instance.sh action_g6 $token djl-serving + ./start_instance.sh action_inf2 $token djl-serving outputs: - gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }} - gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }} - gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }} +# gpu_instance_id_1: ${{ 
steps.create_gpu.outputs.action_g6_instance_id }} +# gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }} +# gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }} + inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }} + inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }} test: - runs-on: [ self-hosted, g6 ] + runs-on: [ self-hosted, ${{ matrix.test.instance}} ] timeout-minutes: 60 needs: create-runners strategy: fail-fast: false matrix: test: - - TestHfHandler - - TestTrtLlmHandler1 - - TestTrtLlmHandler2 - - TestSchedulerSingleGPU - - TestSchedulerMultiGPU - - TestLmiDist1 - - TestLmiDist2 - - TestVllm1 - - TestVllmLora - - TestLmiDistLora +# - test: TestHfHandler +# instance: g6 +# - test: TestTrtLlmHandler1 +# instance: g6 +# - test: TestTrtLlmHandler2 +# instance: g6 +# - test: TestSchedulerSingleGPU +# instance: g6 +# - test: TestSchedulerMultiGPU +# instance: g6 +# - test: TestLmiDist1 +# instance: g6 +# - test: TestLmiDist2 +# instance: g6 +# - test: TestVllm1 +# instance: g6 +# - test: TestVllmLora +# instance: g6 +# - test: TestLmiDistLora +# instance: g6 + - test: TestNeuronx1 + instance: inf2 + - test: TestNeuronx2 + instance: inf2 + - test: TestNeuronxRollingBatch + instance: inf2 steps: - uses: actions/checkout@v4 - name: Clean env @@ -78,7 +114,7 @@ jobs: with: python-version: '3.10.x' - name: Install pip dependencies - run: pip3 install pytest requests numpy huggingface_hub + run: pip3 install pytest requests numpy pillow huggingface_hub - name: Install awscurl working-directory: tests/integration run: | @@ -90,7 +126,7 @@ jobs: env: TEST_DJL_VERSION: ${{ inputs.djl-version }} run: | - pytest -k ${{ matrix.test }} tests.py + pytest -k ${{ matrix.test.test }} tests.py - name: Cleanup working-directory: tests/integration run: | @@ -108,20 +144,79 @@ jobs: if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: test-${{ matrix.test }}-logs + name: test-${{ matrix.test.test }}-logs path: tests/integration/all_logs/ + transformers-neuronx-container-unit-tests: + runs-on: [ self-hosted, inf2 ] + timeout-minutes: 15 + needs: create-runners + steps: + - uses: actions/checkout@v4 + - name: Clean env + run: | + yes | docker system prune -a --volumes + sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ + echo "wait dpkg lock..." 
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy pillow wheel
+      - name: Build container name
+        run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
+      - name: Download models and dockers
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: Run djl_python unit/integration tests on container
+        working-directory: engines/python/setup
+        run: |
+          # Setup
+          pip install setuptools
+          python3 -m setup bdist_wheel
+          mkdir logs
+          docker run -t --rm --network="host" \
+          --name neuron-test \
+          -v $PWD/:/opt/ml/model/ \
+          -w /opt/ml/model \
+          --device=/dev/neuron0:/dev/neuron0 \
+          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
+          /bin/bash -c "pip install /opt/ml/model/dist/*.whl pytest && \
+          pytest djl_python/tests/neuron_test_scripts/ | tee logs/results.log"
+
+          # Cleanup
+          sudo rm -rf TinyLlama .pytest_cache djl_python
+
+          # Fail on failed tests
+          if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
+      - name: On fail step
+        if: ${{ failure() }}
+        working-directory: engines/python/setup
+        run: |
+          cat logs/results.log
+      - name: Upload test logs
+        uses: actions/upload-artifact@v3
+        with:
+          name: transformers-neuronx-logs
+          path: engines/python/setup/logs/
+
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, test]
+    needs: [ create-runners, test, transformers-neuronx-container-unit-tests]
     steps:
       - name: Stop all instances
         run: |
           cd /home/ubuntu/djl_benchmark_script/scripts
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
-          ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
+          # instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
+          # ./stop_instance.sh $instance_id
+          # instance_id=${{ needs.create-runners.outputs.gpu_instance_id_2 }}
+          # ./stop_instance.sh $instance_id
+          # instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
+          # ./stop_instance.sh $instance_id
+          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_1 }}
           ./stop_instance.sh $instance_id
-          instance_id=${{ needs.create-runners.outputs.gpu_instance_id_3 }}
+          instance_id=${{ needs.create-runners.outputs.inf2_instance_id_2 }}
           ./stop_instance.sh $instance_id
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 089de8d969..e83afc8243 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -2,16 +2,18 @@
 import os
 import subprocess
 
+import pytest
 import llm.prepare as prepare
 import llm.client as client
 import rb_client as rb_client
+import test_client
 
 djl_version = os.environ.get('TEST_DJL_VERSION', '').strip()
 
 
 class Runner:
 
-    def __init__(self, container, test_name=None):
+    def __init__(self, container, test_name=None, download=False):
         self.container = container
         self.test_name = test_name
 
@@ -26,6 +28,10 @@ def __init__(self, container, test_name=None):
 
         self.image = f"deepjavalibrary/djl-serving:{flavor}"
 
+        if download:
+            if not os.path.exists("models"):
+                os.system(f"./download_models.sh {self.container}")
+
     def __enter__(self):
         # os.system(f'docker pull {self.image}')
         os.system('rm -rf models')
@@ -39,16 +45,22 @@ def __exit__(self, *args):
         subprocess.run(["./remove_container.sh"], check=True)
         os.system("cat logs/serving.log")
 
-    def launch(self, env_vars=None):
+    def launch(self, env_vars=None, container=None, cmd=None):
         if env_vars is not None:
             with open("docker_env", "w") as f:
                 f.write(env_vars)
 
+        if container is None:
+            container = self.container
+
+        if cmd is None:
+            cmd = 'serve -m test=file:/opt/ml/model/test/'
+
         model_dir = os.path.join(os.getcwd(), 'models')
-        subprocess.run(
-            f'./launch_container.sh {self.image} {model_dir} {self.container} serve -m test=file:/opt/ml/model/test/'
+        return subprocess.run(
+            f'./launch_container.sh {self.image} {model_dir} {container} {cmd}'
             .split(),
-            check=True)
+            check=True, capture_output=True)
 
 
 class TestHfHandler:
@@ -424,3 +436,139 @@ def test_lora_llama3_8b(self):
             prepare.build_lmi_dist_model("llama3-8b-unmerged-lora")
             r.launch()
             client.run("lmi_dist_adapters llama3-8b-unmerged-lora".split())
+
+
+class TestNeuronx1:
+    # Runs on inf2.24xl
+
+    def test_python_mode(self):
+        with Runner('pytorch-inf2', 'test_python_mode', download=True) as r:
+            r.launch(cmd='serve -m test::PyTorch:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz', container='pytorch-inf2-1')
+            test_client.run()
+
+    def test_gpt2(self):
+        with Runner('pytorch-inf2', 'gpt2') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt2")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx gpt2".split())
+
+    def test_gpt2_quantize(self):
+        with Runner('pytorch-inf2', 'gpt2-quantize') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt2-quantize")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx gpt2-quantize".split())
+
+    def test_opt_1_3b(self):
+        with Runner('pytorch-inf2', 'opt-1.3b') as r:
+            prepare.build_transformers_neuronx_handler_model("opt-1.3b")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx opt-1.3b".split())
+
+    def test_gpt_j_6b(self):
+        with Runner('pytorch-inf2', 'gpt-j-6b') as r:
+            prepare.build_transformers_neuronx_handler_model("gpt-j-6b")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx gpt-j-6b".split())
+
+    def test_pythia(self):
+        with Runner('pytorch-inf2', 'pythia-2.8b') as r:
+            prepare.build_transformers_neuronx_handler_model("pythia-2.8b")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx pythia-2.8b".split())
+
+    def test_bloom(self):
+        with Runner('pytorch-inf2', 'bloom-7b1') as r:
+            prepare.build_transformers_neuronx_handler_model("bloom-7b1")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx bloom-7b1".split())
+
+    @pytest.mark.parametrize("model", ["gpt2", "gpt2-quantize"])
+    def test_partition(self, model):
+        with Runner('pytorch-inf2', f'partition-{model}') as r:
+            prepare.build_transformers_neuronx_handler_model(model)
+            with open("models/test/requirements.txt", "a") as f:
+                f.write("dummy_test")
+            partition_output = r.launch(cmd='partition --model-dir /opt/ml/input/data/training/ --skip-copy')
+
+            # Check that .neff files were generated
+            if len([fn for fn in os.listdir("models/test/partition-test/compiled") if fn.endswith(".neff")]) == 0:
+                raise Exception("Failed to generate any .neff files")
+
+            # Check that the requirements.txt install succeeded
+            if 'pip install requirements succeed!' not in partition_output.stdout.decode("utf-8"):
+                raise Exception("Requirements.txt not installed successfully")
+
+
+class TestNeuronx2:
+    # Runs on inf2.24xl
+
+    def test_stream_opt(self):
+        with Runner('pytorch-inf2', 'opt-1.3b-streaming') as r:
+            prepare.build_transformers_neuronx_handler_model("opt-1.3b-streaming")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx opt-1.3b-streaming".split())
+
+    def test_mistral(self):
+        with Runner('pytorch-inf2', 'mistral-7b') as r:
+            prepare.build_transformers_neuronx_handler_model("mistral-7b")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx mistral-7b".split())
+
+    def test_stable_diffusion_1_5(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-1.5-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model("stable-diffusion-1.5-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run("neuron-stable-diffusion stable-diffusion-1.5-neuron".split())
+
+    def test_stable_diffusion_2_1(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-2.1-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model("stable-diffusion-2.1-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run("neuron-stable-diffusion stable-diffusion-2.1-neuron".split())
+
+    def test_stable_diffusion_xl(self):
+        with Runner('pytorch-inf2', 'stable-diffusion-xl-neuron') as r:
+            prepare.build_transformers_neuronx_handler_model("stable-diffusion-xl-neuron")
+            r.launch(container='pytorch-inf2-2')
+            client.run("neuron-stable-diffusion stable-diffusion-xl-neuron".split())
+
+
+class TestNeuronxRollingBatch:
+    # Runs on inf2.24xl
+
+    def test_llama_7b(self):
+        with Runner('pytorch-inf2', 'llama-7b-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-7b-rb")
+            r.launch(container='pytorch-inf2-2')
+            client.run("transformers_neuronx_rolling_batch llama-7b-rb".split())
+
+    def test_tiny_llama_vllm(self):
+        with Runner('pytorch-inf2', 'tiny-llama-rb-vllm') as r:
+            prepare.build_transformers_neuronx_handler_model("tiny-llama-rb-vllm")
+            r.launch(container='pytorch-inf2-1')
+            client.run("transformers_neuronx_rolling_batch tiny-llama-rb-vllm".split())
+
+    def test_llama3_vllm(self):
+        with Runner('pytorch-inf2', 'llama-3-8b-rb-vllm') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-3-8b-rb-vllm")
+            r.launch(container='pytorch-inf2-4')
+            client.run("transformers_neuronx_rolling_batch llama-3-8b-rb-vllm".split())
+
+    def test_mixtral(self):
+        with Runner('pytorch-inf2', 'mixtral-8x7b-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("mixtral-8x7b-rb")
+            r.launch(container='pytorch-inf2-4')
+            client.run("transformers_neuronx_rolling_batch mixtral-8x7b-rb".split())
+
+    def test_llama_speculative(self):
+        with Runner('pytorch-inf2', 'llama-speculative-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-speculative-rb")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx_rolling_batch llama-speculative-rb".split())
+
+    def test_llama_speculative_compiled(self):
+        with Runner('pytorch-inf2', 'llama-speculative-compiled-rb') as r:
+            prepare.build_transformers_neuronx_handler_model("llama-speculative-compiled-rb")
+            r.launch(container='pytorch-inf2-6')
+            client.run("transformers_neuronx_rolling_batch llama-speculative-compiled-rb".split())
+
diff --git a/tests/test_client.py b/tests/test_client.py
index f5da2480ff..70289959a0 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -67,5 +67,12 @@ def test_image(self):
         self.assertEqual(res.content, img.tobytes())
 
 
-if __name__ == '__main__':
-    unittest.main(verbosity=2)
+def run():
+    # pin module/argv so this also works when imported by the pytest suite,
+    # and surface failures to the caller instead of exiting the process
+    prog = unittest.main(module=__name__, argv=['test_client'], exit=False, verbosity=2)
+    assert prog.result.wasSuccessful()
+
+
+if __name__ == '__main__':
+    run()