Skip to content

Commit

Permalink
[tnx] refactor neuron testing
Browse files Browse the repository at this point in the history
  • Loading branch information
tosterberg committed May 24, 2024
1 parent 030a745 commit 921b71d
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 8 deletions.
54 changes: 53 additions & 1 deletion .github/workflows/llm_inf2_integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,53 @@ jobs:
inf2_instance_id_1: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}
inf2_instance_id_2: ${{ steps.create_inf2_2.outputs.action_inf2_instance_id }}

# Runs the djl_python neuron unit/integration tests inside the pytorch-inf2
# djl-serving container on a self-hosted inf2 runner.
transformers-neuronx-container-unit-tests:
  runs-on: [ self-hosted, inf2 ]
  timeout-minutes: 15
  needs: create-runners
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install requests numpy pillow
    - name: Build container name
      run: ./serving/docker/scripts/docker_name_builder.sh pytorch-inf2 ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
    - name: Run djl_python unit/integration tests on container
      run: |
        # -p: do not fail when a previous run left the directory behind.
        mkdir -p logs
        # Capture the container's exit status instead of letting the step
        # abort immediately, so the TinyLlama cleanup below always runs.
        status=0
        # The command string must not wrap "pip install ..." in single
        # quotes: bash would then look for one command with that whole name.
        # pipefail makes the pipeline report pytest's exit status rather
        # than tee's.
        docker run -t --rm --network="host" \
          --name neuron-test \
          -v "$PWD/:/opt/ml/model/" \
          -w /opt/ml/model \
          --device=/dev/neuron0:/dev/neuron0 \
          deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG \
          /bin/bash -c "set -o pipefail; \
          pip install /opt/ml/model/engines/python/setup/. pytest && \
          pytest engines/python/setup/djl_python/tests/neuron_test_scripts/ | tee logs/results.log" \
          || status=$?
        sudo rm -rf TinyLlama
        # Fail on a non-zero container exit or on any "failed" marker in
        # the captured test output.
        if [ "$status" -ne 0 ] || grep -F "failed" logs/results.log &>/dev/null; then exit 1; fi
    - name: On fail step
      if: ${{ failure() }}
      run: |
        cat logs/results.log
    - name: Upload test logs
      # Upload even when the tests failed; that is when the logs matter.
      if: always()
      uses: actions/upload-artifact@v3
      with:
        # This job defines no matrix, so use a fixed artifact name.
        name: transformers-neuronx-container-unit-tests-logs
        path: logs/

transformers-neuronx-test-1:
runs-on: [ self-hosted, inf2 ]
timeout-minutes: 90
Expand Down Expand Up @@ -162,6 +209,7 @@ jobs:
# checking whether requirements.txt download is successful
if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
then echo "requirements.txt install was successful"; else exit 1; fi
if [ -d models ]; then sudo rm -rf models; fi
- name: Test gpt2-quantize partition
working-directory: tests/integration
run: |
Expand All @@ -181,10 +229,12 @@ jobs:
# checking whether requirements.txt download is successful
if grep -F "pip install requirements succeed!" partition_output.log &>/dev/null; \
then echo "requirements.txt install was successful"; else exit 1; fi
if [ -d models ]; then sudo rm -rf models; fi
- name: On fail step
if: ${{ failure() }}
working-directory: tests/integration
run: |
if [ -d models ]; then sudo rm -rf models; fi
cat logs/serving.log
- name: Upload test logs
uses: actions/upload-artifact@v3
Expand Down Expand Up @@ -277,6 +327,7 @@ jobs:
if: ${{ failure() }}
working-directory: tests/integration
run: |
if [ -d models ]; then sudo rm -rf models; fi
cat logs/serving.log
- name: Upload test logs
uses: actions/upload-artifact@v3
Expand Down Expand Up @@ -382,6 +433,7 @@ jobs:
if: ${{ failure() }}
working-directory: tests/integration
run: |
if [ -d models ]; then sudo rm -rf models; fi
cat logs/serving.log
- name: Upload test logs
uses: actions/upload-artifact@v3
Expand All @@ -392,7 +444,7 @@ jobs:
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, transformers-neuronx-test-1, transformers-neuronx-test-2, transformers-neuronx-rolling-batch ]
needs: [ create-runners, transformers-neuronx-container-unit-tests, transformers-neuronx-test-1, transformers-neuronx-test-2, transformers-neuronx-rolling-batch ]
steps:
- name: Stop all instances
run: |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,8 @@ def validate_rolling_batch(cls, rolling_batch: str) -> str:
if rolling_batch not in TNX_SUPPORTED_ROLLING_BATCH_TYPES:
logging.warning(
f"transformer neuronx only supports "
f"rolling batch type {TNX_SUPPORTED_ROLLING_BATCH_TYPES}."
f"choosing neuronx rolling batch automatically.")
f"rolling batch type {TNX_SUPPORTED_ROLLING_BATCH_TYPES} "
f"choosing 'tnx' rolling batch automatically.")
return 'auto'
return rolling_batch

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
try:
from djl_python.properties_manager.tnx_properties import TransformerNeuronXProperties
from djl_python.rolling_batch.vllm_rolling_batch import VLLMRollingBatch
from djl_python.tests.rolling_batch_test_scripts.generator import Generator
SKIP_TEST = False
except ImportError:
SKIP_TEST = True
Expand All @@ -46,10 +45,11 @@ class TestNeuronVLLM(unittest.TestCase):
def test_models(self):
# === Preparation ===
script_directory = os.path.dirname(os.path.abspath(__file__))
relative_path = "../../../"
relative_path = "../rolling_batch_test_scripts"
new_path = os.path.normpath(
os.path.join(script_directory, relative_path))
sys.path.append(new_path)
from djl_python.tests.rolling_batch_test_scripts.generator import Generator

# --- Models ---
model_names = [
Expand Down Expand Up @@ -148,5 +148,4 @@ def test_models(self):


if __name__ == '__main__':
c = TestNeuronVLLM()
c.test_models()
unittest.main()
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def test_set_rolling_batch(self, params):

# Test
with self.patch_neuron_rolling_batch() as mock_rolling_batch:
self.service.set_rolling_batch()
self.service.set_rolling_batch(test_properties)

# Evaluate
self.assertEqual(mock_rolling_batch.called, expected)
Expand Down

0 comments on commit 921b71d

Please sign in to comment.