Inferentia2 integration tests #656
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity and triggers.
name: Inferentia2 integration tests

on:
  # Scheduled run: once a day at 15:00 (GitHub evaluates cron in UTC).
  schedule:
    - cron: "0 15 * * *"
  # Manual run, with an optional `djl-version` input that is empty by default.
  workflow_dispatch:
    inputs:
      djl-version:
        description: "The released version of DJL"
        required: false
        default: ""
jobs: | |
# Starts the Inf2 instance used by the test jobs and publishes its instance id
# as a job output so the teardown job can stop it afterwards.
create-runners:
  runs-on: [self-hosted, scheduler]
  steps:
    - name: Create new Inf2.24xl instance
      id: create_inf2
      # An explicit `shell: bash` runs the script with `-eo pipefail`, so a
      # failing `curl --fail` aborts the step instead of being masked by jq's
      # exit status and launching an instance with an empty token.
      shell: bash
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        # Request a runner registration token for the djl-serving repository.
        token=$(curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token')
        # Quoted so an unexpected empty value cannot shift the positional args.
        ./start_instance.sh action_inf2 "$token" djl-serving
  outputs:
    # NOTE(review): presumably start_instance.sh writes
    # `action_inf2_instance_id` as a step output - confirm in that script.
    inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
# Runs the djl_python Neuron test scripts with pytest inside the pytorch-inf2
# djl-serving container on the self-hosted inf2 runner.
transformers-neuronx-container-unit-tests:
  runs-on: [self-hosted, inf2]
  timeout-minutes: 15
  needs: create-runners
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        # Block until no process holds the dpkg/apt lock files.
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install requests numpy pillow
    - name: Build container name
      # NOTE(review): presumably exports DJLSERVING_DOCKER_TAG for the steps
      # below - confirm in docker_name_builder.sh.
      run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
    - name: Run djl_python unit/integration tests on container
      run: |
        # Setup
        mkdir -p logs
        # Run test suite
        # NOTE(review): the inner single quotes around the pip command are kept
        # verbatim; they presumably rely on how the image entrypoint
        # re-evaluates its arguments - confirm before changing the quoting.
        docker run -t --rm --network="host" \
          --name neuron-test \
          -v $PWD/:/opt/ml/model/ \
          -w /opt/ml/model \
          --device=/dev/neuron0:/dev/neuron0 \
          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
          /bin/bash -c "'pip install /opt/ml/model/engines/python/setup/. pytest' && \
          pytest engines/python/setup/djl_python/tests/neuron_test_scripts/ | tee logs/results.log"
        # Fail on failed tests: pytest's exit status is hidden behind `tee`,
        # so the log is scanned for the word "failed" instead.
        if grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
    - name: On fail step
      if: ${{ failure() }}
      run: |
        cat logs/results.log
    - name: Upload test logs
      # Run even when an earlier step failed; that is when the logs matter.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        # This job defines no matrix, so the previous `${{ matrix.arch }}`
        # rendered as an empty string; use a fixed, job-unique name instead.
        name: transformers-neuronx-container-unit-tests-logs
        path: logs/
# Serves the resnet18 inf2 test models from the pytorch-inf2 container in
# PyTorch and Python engine modes, and sends a test image to each.
transformers-neuronx-test-1:
  runs-on: [self-hosted, inf2]
  timeout-minutes: 90
  needs: [create-runners, transformers-neuronx-container-unit-tests]
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        # Block until no process holds the dpkg/apt lock files.
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install requests numpy pillow
    - name: Build container name
      # NOTE(review): presumably exports DJLSERVING_DOCKER_TAG for the steps
      # below - confirm in docker_name_builder.sh.
      run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      working-directory: tests/integration
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
        mkdir -p logs
        ./download_models.sh pytorch-inf2
    - name: Test Pytorch model
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          serve -m test::PyTorch:nc0=file:/opt/ml/model/resnet18_inf2_2_4.tar.gz
        ./test_client.sh image/jpg models/kitten.jpg
        # Remove every container so the next test starts from a clean slate.
        docker rm -f $(docker ps -aq)
    - name: Test Python mode
      working-directory: tests/integration
      run: |
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models pytorch-inf2-1 \
          serve -m test::Python:nc0=file:/opt/ml/model/resnet18_no_reqs_inf2_2_4.tar.gz
        ./test_client.sh image/jpg models/kitten.jpg
        docker rm -f $(docker ps -aq)
    - name: On fail step
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        if [ -d models ]; then sudo rm -rf models; fi
        cat logs/serving.log
    - name: Upload test logs
      # Run even when an earlier step failed; that is when the logs matter.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        # This job defines no matrix, so the previous `${{ matrix.arch }}`
        # rendered as an empty string; use a fixed, job-unique name instead.
        name: transformers-neuronx-test-1-logs
        path: tests/integration/logs/
# Teardown: `if: always()` makes this job run whatever the outcome of the jobs
# it waits for, and it stops the instance reported by create-runners.
stop-runners:
  if: always()
  runs-on: [self-hosted, scheduler]
  needs:
    - create-runners
    - transformers-neuronx-container-unit-tests
    - transformers-neuronx-test-1
  steps:
    - name: Stop all instances
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        # Instance id published as a job output by create-runners.
        inf2_id=${{ needs.create-runners.outputs.inf2_instance_id_1 }}
        ./stop_instance.sh $inf2_id