Commit 9fbc954

[docker] bump neuron to 2.19 SDK (#2160)

tosterberg authored Jul 12, 2024
1 parent 88f84ba commit 9fbc954

Showing 7 changed files with 21 additions and 24 deletions.
7 changes: 4 additions & 3 deletions engines/python/setup/djl_python/neuron_utils/model_loader.py
@@ -23,7 +23,7 @@
 from transformers import AutoModelForCausalLM, GenerationConfig
 from transformers_neuronx import NeuronAutoModelForCausalLM
 from transformers_neuronx.config import NeuronConfig, QuantizationConfig, ContinuousBatchingConfig, GenerationConfig as NeuronGenerationConfig
-from djl_python.properties_manager.tnx_properties import TnXGenerationStrategy, TnXModelSchema
+from djl_python.properties_manager.tnx_properties import TnXGenerationStrategy, TnXModelSchema, TnXMemoryLayout
 from transformers_neuronx.module import save_pretrained_split
 from djl_python.neuron_utils.utils import NeuronXModelAdapter, get_neuronxcc_version
 from huggingface_hub import hf_hub_download
@@ -228,11 +228,12 @@ def get_model_specific_kwargs(self) -> dict:
             ]
         elif self.config.context_length_estimate != [
                 self.config.n_positions
-        ]:
+        ] and self.config.cache_layout == TnXMemoryLayout.LAYOUT_BSH:
             raise RuntimeError(
                 f"context_length_estimate {self.config.context_length_estimate}"
                 f" need to be the same as n_positions {self.config.n_positions}"
-                f" You can also unset option.context_length_estimate to make continuous batching to work"
+                f" when using alternative cache layouts,"
+                f" you can always unset cache_layout to support multi bucketing w/ continuous batching."
             )
         return model_kwargs

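Note: the relaxed check above only rejects a context_length_estimate / n_positions mismatch when the KV cache uses the BSH layout; other layouts can keep multiple buckets alongside continuous batching. A minimal standalone sketch of the new gating logic (names mirror the diff, but this is an illustration, not the DJL source):

from enum import Enum

class TnXMemoryLayout(Enum):
    LAYOUT_BSH = "BSH"   # batch, sequence, hidden
    LAYOUT_SBH = "SBH"   # sequence, batch, hidden

def validate_bucketing(context_length_estimate: list,
                       n_positions: int,
                       cache_layout: TnXMemoryLayout) -> None:
    # Only the BSH cache layout requires a single bucket equal to
    # n_positions; other layouts may keep multiple buckets while
    # continuous batching is enabled.
    if (context_length_estimate != [n_positions]
            and cache_layout == TnXMemoryLayout.LAYOUT_BSH):
        raise RuntimeError(
            f"context_length_estimate {context_length_estimate} must equal "
            f"n_positions {n_positions} for the BSH cache layout; unset "
            f"cache_layout to use multi-bucketing with continuous batching.")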
@@ -346,8 +346,4 @@ def generate_tokens(
         if unfinished_sequences.max() == 0:
             break

-        # stop if we exceed the maximum length
-        if selector.stopping_criteria(input_ids, None):
-            break
-
     return input_ids
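Note: the deleted block re-checked the stopping criteria inside the generation loop; the per-batch unfinished_sequences bookkeeping already terminates the loop once every sequence finishes. A sketch of that standard loop shape (illustrative stand-ins, not the DJL source; model is any callable returning logits):

import torch

def greedy_loop(model, input_ids: torch.LongTensor, eos_token_id: int,
                max_new_tokens: int) -> torch.LongTensor:
    # One flag per sequence in the batch: 1 = still generating, 0 = finished.
    unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long)
    for _ in range(max_new_tokens):
        logits = model(input_ids)                      # (batch, seq, vocab)
        next_tokens = logits[:, -1, :].argmax(dim=-1)  # greedy pick
        # Finished sequences keep emitting the eos/pad token.
        next_tokens = (next_tokens * unfinished_sequences
                       + eos_token_id * (1 - unfinished_sequences))
        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
        unfinished_sequences = (unfinished_sequences
                                * (next_tokens != eos_token_id).long())
        if unfinished_sequences.max() == 0:  # every sequence hit EOS
            break
    return input_ids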
@@ -276,7 +276,7 @@ def _preprocess_prefill(self,

         prefill_slots = []
         for request in new_requests:
-            slot = empty_slots.pop()
+            slot = empty_slots.pop(0)
             slot.assign(request, self.model.generation_config, self.tokenizer,
                         self.acceptor)
             prefill_slots.append(slot)
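Note: list.pop() with no argument removes the last element, so a just-released slot would be reused immediately (LIFO); pop(0) hands out the oldest free slot instead (FIFO). Plain-Python illustration:

from collections import deque

empty_slots = [0, 1, 2, 3]

lifo = list(empty_slots)
assert lifo.pop() == 3    # no index: takes the newest (last) slot, LIFO

fifo = list(empty_slots)
assert fifo.pop(0) == 0   # index 0: takes the oldest slot first, FIFO

# For long queues, deque gives O(1) pops from the left:
queue = deque(empty_slots)
assert queue.popleft() == 0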
@@ -250,7 +250,7 @@ def accept_speculated_tokens(self, *args, **kwargs):

     @property
     def stopped(self) -> bool:
-        return self._selector.stopping_criteria(self._tokens, None)
+        return self._selector.stopping_criteria(self._tokens.view(1, -1), None)

     @property
     def tokens(self) -> torch.LongTensor:
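Note: Hugging Face stopping criteria treat dimension 0 of input_ids as the batch, so a slot's flat 1-D token tensor must be viewed as a one-row batch before the call. A sketch under that assumption (recent transformers versions return a per-sequence BoolTensor):

import torch
from transformers import MaxLengthCriteria

criteria = MaxLengthCriteria(max_length=8)

tokens = torch.arange(10, dtype=torch.long)  # flat per-slot tensor, shape (10,)

# A flat tensor of 10 tokens would be misread as 10 sequences of length 1;
# view it as a single-row batch of shape (1, 10) first.
done = criteria(tokens.view(1, -1), None)    # BoolTensor of shape (1,)
assert bool(done) is True                    # 10 tokens >= max_length of 8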
@@ -161,7 +161,7 @@ def create(
             generation_config.pad_token_id = eos_token_id if isinstance(
                 eos_token_id, int) else eos_token_id[0]

-        generation_mode = model._get_generation_mode(generation_config, None)
+        generation_mode = generation_config.get_generation_mode()
         if generation_mode not in [
                 GenerationMode.GREEDY_SEARCH, GenerationMode.SAMPLE
         ]:
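Note: with the transformers 4.41.1 pin introduced below, generation-mode resolution lives on GenerationConfig itself rather than the private model._get_generation_mode helper. A quick sketch of the public API (assuming transformers >= 4.41):

from transformers import GenerationConfig
from transformers.generation.configuration_utils import GenerationMode

# Greedy: no sampling, single beam.
greedy = GenerationConfig(do_sample=False, num_beams=1)
assert greedy.get_generation_mode() == GenerationMode.GREEDY_SEARCH

# Sampling: same shape, do_sample flipped on.
sampling = GenerationConfig(do_sample=True, num_beams=1)
assert sampling.get_generation_mode() == GenerationMode.SAMPLE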
20 changes: 10 additions & 10 deletions serving/docker/pytorch-inf2.Dockerfile
@@ -10,21 +10,21 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 FROM ubuntu:22.04
-ARG djl_version=0.28.0~SNAPSHOT
+ARG djl_version=0.29.0~SNAPSHOT
 ARG torch_version=2.1.2
 ARG torchvision_version=0.16.2
 ARG python_version=3.10
-ARG neuronsdk_version=2.18.2
-ARG torch_neuronx_version=2.1.2.2.1.0
-ARG transformers_neuronx_version=0.10.0.360
-ARG neuronx_distributed_version=0.7.0
-ARG neuronx_cc_version=2.13.72.0
+ARG neuronsdk_version=2.19.0
+ARG torch_neuronx_version=2.1.2.2.2.0
+ARG transformers_neuronx_version=0.11.351
+ARG neuronx_distributed_version=0.8.0
+ARG neuronx_cc_version=2.14.213.0
 ARG protobuf_version=3.19.6
-ARG transformers_version=4.36.2
-ARG accelerate_version=0.23.0
-ARG diffusers_version=0.26.1
+ARG transformers_version=4.41.1
+ARG accelerate_version=0.29.2
+ARG diffusers_version=0.28.2
 ARG pydantic_version=2.6.1
-ARG optimum_neuron_version=0.0.22
+ARG optimum_neuron_version=0.0.23
 # %2B is the url escape for the '+' character
 ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-0.4.2%2Bnightly-py3-none-any.whl"
 EXPOSE 8080
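Note: one hedged way to sanity-check a built image against these pins is to query the installed distributions at runtime. The pip distribution names below are assumptions inferred from the ARG names, not taken from the Dockerfile's install steps:

from importlib.metadata import version, PackageNotFoundError

# Distributions the image is expected to carry after this bump
# (names assumed; adjust to the wheels the Dockerfile actually installs).
expected = {
    "torch-neuronx": "2.1.2.2.2.0",
    "transformers-neuronx": "0.11.351",
    "neuronx-distributed": "0.8.0",
    "neuronx-cc": "2.14.213.0",
    "transformers": "4.41.1",
    "accelerate": "0.29.2",
    "diffusers": "0.28.2",
    "optimum-neuron": "0.0.23",
}

for name, want in expected.items():
    try:
        got = version(name)
        status = "OK" if got == want else f"MISMATCH (found {got})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{name}=={want}: {status}")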
8 changes: 4 additions & 4 deletions serving/docker/scripts/install_inferentia2.sh
@@ -15,12 +15,12 @@ echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" >/etc
 curl -L https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

 # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/releasecontent.html#inf2-packages
-apt-get update -y && apt-get install -y aws-neuronx-collectives=2.20.22.0* \
-  aws-neuronx-runtime-lib=2.20.22.0* \
-  aws-neuronx-tools=2.17.1.0
+apt-get update -y && apt-get install -y aws-neuronx-collectives=2.21.46.0* \
+  aws-neuronx-runtime-lib=2.21.41.0* \
+  aws-neuronx-tools=2.18.3.0

 # TODO: Remove this hack after aws-neuronx-dkms install no longer throws an error, this bypasses the `set -ex`
 # exit criteria. The package is installed and functional after running, just throws an error on install.
-apt-get install -y aws-neuronx-dkms=2.16.7.0 || echo "Installed aws-neuronx-dkms with errors"
+apt-get install -y aws-neuronx-dkms=2.17.17.0 || echo "Installed aws-neuronx-dkms with errors"

 export PATH=/opt/aws/neuron/bin:$PATH

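Note: the apt pins can be verified the same way from inside the container; this sketch shells out to dpkg-query (standard on Ubuntu) and compares against the versions pinned above:

import subprocess

# Neuron apt packages pinned by this change (dkms checked too,
# despite its noisy install).
pins = {
    "aws-neuronx-collectives": "2.21.46.0",
    "aws-neuronx-runtime-lib": "2.21.41.0",
    "aws-neuronx-tools": "2.18.3.0",
    "aws-neuronx-dkms": "2.17.17.0",
}

for pkg, want in pins.items():
    # dpkg-query prints the installed version, or fails if not installed.
    result = subprocess.run(
        ["dpkg-query", "-W", "-f=${Version}", pkg],
        capture_output=True, text=True)
    if result.returncode != 0:
        print(f"{pkg}: not installed")
    else:
        ok = result.stdout.startswith(want)
        print(f"{pkg}: {result.stdout} ({'OK' if ok else 'unexpected'})")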