From 37908c61b65949110e07445610a4c304c2556e31 Mon Sep 17 00:00:00 2001
From: Tyler Osterberg
Date: Fri, 5 Jul 2024 09:03:22 -0700
Subject: [PATCH] [docker] bump neuron to 2.19 SDK

---
 .../djl_python/neuron_utils/model_loader.py   |  7 ++++---
 .../optimum_modeling.py                       |  4 ----
 .../optimum_neuron_scheduler.py               |  2 +-
 .../transformers_neuronx_scheduler/slot.py    |  2 +-
 .../token_selector.py                         |  2 +-
 serving/docker/pytorch-inf2.Dockerfile        | 20 +++++++++----------
 serving/docker/scripts/install_inferentia2.sh |  8 ++++----
 7 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/engines/python/setup/djl_python/neuron_utils/model_loader.py b/engines/python/setup/djl_python/neuron_utils/model_loader.py
index eab65d71d..8f5c0a021 100644
--- a/engines/python/setup/djl_python/neuron_utils/model_loader.py
+++ b/engines/python/setup/djl_python/neuron_utils/model_loader.py
@@ -23,7 +23,7 @@
 from transformers import AutoModelForCausalLM, GenerationConfig
 from transformers_neuronx import NeuronAutoModelForCausalLM
 from transformers_neuronx.config import NeuronConfig, QuantizationConfig, ContinuousBatchingConfig, GenerationConfig as NeuronGenerationConfig
-from djl_python.properties_manager.tnx_properties import TnXGenerationStrategy, TnXModelSchema
+from djl_python.properties_manager.tnx_properties import TnXGenerationStrategy, TnXModelSchema, TnXMemoryLayout
 from transformers_neuronx.module import save_pretrained_split
 from djl_python.neuron_utils.utils import NeuronXModelAdapter, get_neuronxcc_version
 from huggingface_hub import hf_hub_download
@@ -228,11 +228,12 @@ def get_model_specific_kwargs(self) -> dict:
             ]
         elif self.config.context_length_estimate != [
                 self.config.n_positions
-        ]:
+        ] and self.config.cache_layout == TnXMemoryLayout.LAYOUT_BSH:
             raise RuntimeError(
                 f"context_length_estimate {self.config.context_length_estimate}"
                 f" need to be the same as n_positions {self.config.n_positions}"
-                f" You can also unset option.context_length_estimate to make continuous batching to work"
+                f" when using alternative cache layouts."
+                f" You can unset cache_layout to support multi-bucketing with continuous batching."
             )
         return model_kwargs
 

diff --git a/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_modeling.py b/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_modeling.py
index 3d21f83a9..b6f24d4ac 100644
--- a/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_modeling.py
+++ b/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_modeling.py
@@ -346,8 +346,4 @@ def generate_tokens(
             if unfinished_sequences.max() == 0:
                 break
 
-            # stop if we exceed the maximum length
-            if selector.stopping_criteria(input_ids, None):
-                break
-
         return input_ids

diff --git a/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py b/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py
index 0cdff91ec..61b2b6a2b 100644
--- a/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py
+++ b/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py
@@ -276,7 +276,7 @@ def _preprocess_prefill(self,
 
         prefill_slots = []
         for request in new_requests:
-            slot = empty_slots.pop()
+            slot = empty_slots.pop(0)
             slot.assign(request, self.model.generation_config, self.tokenizer,
                         self.acceptor)
             prefill_slots.append(slot)

diff --git a/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py b/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py
index 0601773f4..1fb90493f 100644
--- a/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py
+++ b/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py
@@ -250,7 +250,7 @@ def accept_speculated_tokens(self, *args, **kwargs):
 
     @property
     def stopped(self) -> bool:
-        return self._selector.stopping_criteria(self._tokens, None)
+        return self._selector.stopping_criteria(self._tokens.view(1, -1), None)
 
     @property
     def tokens(self) -> torch.LongTensor:

diff --git a/engines/python/setup/djl_python/transformers_neuronx_scheduler/token_selector.py b/engines/python/setup/djl_python/transformers_neuronx_scheduler/token_selector.py
index 51408658a..c4d88ce1c 100644
--- a/engines/python/setup/djl_python/transformers_neuronx_scheduler/token_selector.py
+++ b/engines/python/setup/djl_python/transformers_neuronx_scheduler/token_selector.py
@@ -161,7 +161,7 @@ def create(
             generation_config.pad_token_id = eos_token_id if isinstance(
                 eos_token_id, int) else eos_token_id[0]
 
-        generation_mode = model._get_generation_mode(generation_config, None)
+        generation_mode = generation_config.get_generation_mode()
         if generation_mode not in [
                 GenerationMode.GREEDY_SEARCH, GenerationMode.SAMPLE
         ]:

diff --git a/serving/docker/pytorch-inf2.Dockerfile b/serving/docker/pytorch-inf2.Dockerfile
index 75073b976..079e9024e 100644
--- a/serving/docker/pytorch-inf2.Dockerfile
+++ b/serving/docker/pytorch-inf2.Dockerfile
@@ -10,21 +10,21 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 FROM ubuntu:22.04
-ARG djl_version=0.28.0~SNAPSHOT
+ARG djl_version=0.29.0~SNAPSHOT
 ARG torch_version=2.1.2
 ARG torchvision_version=0.16.2
 ARG python_version=3.10
-ARG neuronsdk_version=2.18.2
-ARG torch_neuronx_version=2.1.2.2.1.0
-ARG transformers_neuronx_version=0.10.0.360
-ARG neuronx_distributed_version=0.7.0
-ARG neuronx_cc_version=2.13.72.0
+ARG neuronsdk_version=2.19.0
+ARG torch_neuronx_version=2.1.2.2.2.0
+ARG transformers_neuronx_version=0.11.351
+ARG neuronx_distributed_version=0.8.0
+ARG neuronx_cc_version=2.14.213.0
 ARG protobuf_version=3.19.6
-ARG transformers_version=4.36.2
-ARG accelerate_version=0.23.0
-ARG diffusers_version=0.26.1
+ARG transformers_version=4.41.1
+ARG accelerate_version=0.29.2
+ARG diffusers_version=0.28.2
 ARG pydantic_version=2.6.1
-ARG optimum_neuron_version=0.0.22
+ARG optimum_neuron_version=0.0.23
 # %2B is the url escape for the '+' character
 ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-0.4.2%2Bnightly-py3-none-any.whl"
 EXPOSE 8080

diff --git a/serving/docker/scripts/install_inferentia2.sh b/serving/docker/scripts/install_inferentia2.sh
index f0396b1d6..cb07ab3de 100755
--- a/serving/docker/scripts/install_inferentia2.sh
+++ b/serving/docker/scripts/install_inferentia2.sh
@@ -15,12 +15,12 @@
 echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" >/etc/apt/sources.list.d/neuron.list
 curl -L https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
 
 # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/releasecontent.html#inf2-packages
-apt-get update -y && apt-get install -y aws-neuronx-collectives=2.20.22.0* \
-  aws-neuronx-runtime-lib=2.20.22.0* \
-  aws-neuronx-tools=2.17.1.0
+apt-get update -y && apt-get install -y aws-neuronx-collectives=2.21.46.0* \
+  aws-neuronx-runtime-lib=2.21.41.0* \
+  aws-neuronx-tools=2.18.3.0
 
 # TODO: Remove this hack after aws-neuronx-dkms install no longer throws an error, this bypasses the `set -ex`
 # exit criteria. The package is installed and functional after running, just throws an error on install.
-apt-get install -y aws-neuronx-dkms=2.16.7.0 || echo "Installed aws-neuronx-dkms with errors"
+apt-get install -y aws-neuronx-dkms=2.17.17.0 || echo "Installed aws-neuronx-dkms with errors"
 export PATH=/opt/aws/neuron/bin:$PATH