[docker] bump neuron to 2.19 SDK #2160

Merged (1 commit) on Jul 12, 2024
@@ -23,7 +23,7 @@
 from transformers import AutoModelForCausalLM, GenerationConfig
 from transformers_neuronx import NeuronAutoModelForCausalLM
 from transformers_neuronx.config import NeuronConfig, QuantizationConfig, ContinuousBatchingConfig, GenerationConfig as NeuronGenerationConfig
-from djl_python.properties_manager.tnx_properties import TnXGenerationStrategy, TnXModelSchema
+from djl_python.properties_manager.tnx_properties import TnXGenerationStrategy, TnXModelSchema, TnXMemoryLayout
 from transformers_neuronx.module import save_pretrained_split
 from djl_python.neuron_utils.utils import NeuronXModelAdapter, get_neuronxcc_version
 from huggingface_hub import hf_hub_download

@@ -228,11 +228,12 @@ def get_model_specific_kwargs(self) -> dict:
             ]
         elif self.config.context_length_estimate != [
                 self.config.n_positions
-        ]:
+        ] and self.config.cache_layout == TnXMemoryLayout.LAYOUT_BSH:
             raise RuntimeError(
                 f"context_length_estimate {self.config.context_length_estimate}"
                 f" need to be the same as n_positions {self.config.n_positions}"
-                f" You can also unset option.context_length_estimate to make continuous batching to work"
+                f" when using alternative cache layouts,"
+                f" you can always unset cache_layout to support multi bucketing w/ continuous batching."
             )
         return model_kwargs

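A note on the check above: the hard requirement that context_length_estimate equal n_positions now applies only when cache_layout is TnXMemoryLayout.LAYOUT_BSH; with other layouts, multiple context-length buckets can coexist with continuous batching, as the new error message suggests. A toy sketch of the condition, using hypothetical literal values rather than the real TnX property objects:

```python
# Hypothetical values for illustration only; the real check reads these from
# the TnX properties (self.config.*) shown in the diff above.
n_positions = 4096
context_length_estimate = [512, 1024, 4096]  # multi-bucketing
cache_layout = "LAYOUT_SBH"                  # any layout other than BSH

# Only the BSH cache layout still demands a single bucket equal to n_positions.
if context_length_estimate != [n_positions] and cache_layout == "LAYOUT_BSH":
    raise RuntimeError(
        "either unset context_length_estimate or unset/switch cache_layout")
print("configuration accepted")
```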
@@ -346,8 +346,4 @@ def generate_tokens(
             if unfinished_sequences.max() == 0:
                 break
 
-            # stop if we exceed the maximum length
-            if selector.stopping_criteria(input_ids, None):
-                break
-
         return input_ids
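With the explicit stopping_criteria break removed, the only exit visible in this hunk is the unfinished_sequences mask reaching zero. A generic sketch of that pattern, not this repository's code and with a hypothetical EOS id, for readers unfamiliar with it:

```python
import torch

eos_token_id = 2                                         # hypothetical EOS id
unfinished_sequences = torch.ones(2, dtype=torch.long)   # two active sequences

# When a sequence emits EOS, its entry in the mask drops to zero.
next_tokens = torch.tensor([2, 17])
unfinished_sequences = unfinished_sequences * (next_tokens != eos_token_id).long()

# The surviving break condition from the diff:
if unfinished_sequences.max() == 0:
    print("stop decoding")
else:
    print("keep decoding")
```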
@@ -276,7 +276,7 @@ def _preprocess_prefill(self,
 
         prefill_slots = []
         for request in new_requests:
-            slot = empty_slots.pop()
+            slot = empty_slots.pop(0)
             slot.assign(request, self.model.generation_config, self.tokenizer,
                         self.acceptor)
             prefill_slots.append(slot)
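The one-character change above swaps LIFO slot reuse for FIFO: list.pop() returns the last element, while list.pop(0) returns the first, so new requests are now assigned the lowest-index free slot. A minimal, self-contained illustration:

```python
# Toy stand-in for the scheduler's free-slot list.
empty_slots = [0, 1, 2, 3]

lifo = list(empty_slots)
fifo = list(empty_slots)

print(lifo.pop())   # 3 -- pop() hands back the last slot in the list
print(fifo.pop(0))  # 0 -- pop(0) hands back the first (lowest-index) slot
```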
@@ -250,7 +250,7 @@ def accept_speculated_tokens(self, *args, **kwargs):
 
     @property
     def stopped(self) -> bool:
-        return self._selector.stopping_criteria(self._tokens, None)
+        return self._selector.stopping_criteria(self._tokens.view(1, -1), None)
 
     @property
     def tokens(self) -> torch.LongTensor:
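The .view(1, -1) above reshapes a slot's flat 1-D token history into a (1, sequence_length) batch, which is the shape Hugging Face stopping criteria expect for input_ids. A small sketch with a stock criterion, assuming a transformers version that still ships MaxLengthCriteria:

```python
import torch
from transformers import MaxLengthCriteria, StoppingCriteriaList

criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=8)])

tokens = torch.arange(10, dtype=torch.long)       # flat per-slot history, shape (10,)
done = criteria(tokens.view(1, -1), scores=None)  # viewed as shape (1, 10)
print(done)  # truthy once the sequence length reaches max_length
```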
@@ -161,7 +161,7 @@ def create(
         generation_config.pad_token_id = eos_token_id if isinstance(
             eos_token_id, int) else eos_token_id[0]
 
-        generation_mode = model._get_generation_mode(generation_config, None)
+        generation_mode = generation_config.get_generation_mode()
         if generation_mode not in [
                 GenerationMode.GREEDY_SEARCH, GenerationMode.SAMPLE
         ]:
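This tracks the transformers bump elsewhere in the PR: generation-mode resolution moves off the model's private _get_generation_mode helper and onto GenerationConfig itself. A minimal sketch of the newer call, assuming transformers 4.41.x as pinned in the Dockerfile below:

```python
from transformers.generation.configuration_utils import GenerationConfig, GenerationMode

generation_config = GenerationConfig(do_sample=False, num_beams=1)

# No model instance is needed; the config resolves its own mode.
mode = generation_config.get_generation_mode()
print(mode == GenerationMode.GREEDY_SEARCH)  # True for greedy settings
```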
serving/docker/pytorch-inf2.Dockerfile (20 changes: 10 additions & 10 deletions)
@@ -10,21 +10,21 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 FROM ubuntu:22.04
-ARG djl_version=0.28.0~SNAPSHOT
+ARG djl_version=0.29.0~SNAPSHOT
 ARG torch_version=2.1.2
 ARG torchvision_version=0.16.2
 ARG python_version=3.10
-ARG neuronsdk_version=2.18.2
-ARG torch_neuronx_version=2.1.2.2.1.0
-ARG transformers_neuronx_version=0.10.0.360
-ARG neuronx_distributed_version=0.7.0
-ARG neuronx_cc_version=2.13.72.0
+ARG neuronsdk_version=2.19.0
+ARG torch_neuronx_version=2.1.2.2.2.0
+ARG transformers_neuronx_version=0.11.351
+ARG neuronx_distributed_version=0.8.0
+ARG neuronx_cc_version=2.14.213.0
 ARG protobuf_version=3.19.6
-ARG transformers_version=4.36.2
-ARG accelerate_version=0.23.0
-ARG diffusers_version=0.26.1
+ARG transformers_version=4.41.1
+ARG accelerate_version=0.29.2
+ARG diffusers_version=0.28.2
 ARG pydantic_version=2.6.1
-ARG optimum_neuron_version=0.0.22
+ARG optimum_neuron_version=0.0.23
 # %2B is the url escape for the '+' character
 ARG vllm_wheel="https://publish.djl.ai/neuron_vllm/vllm-0.4.2%2Bnightly-py3-none-any.whl"
 EXPOSE 8080
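The version bumps above track the Neuron 2.19 SDK release. A quick, hypothetical sanity check one could run inside the built image to confirm the Python-level pins; the distribution names are assumptions, not taken from this PR:

```python
# Hypothetical post-build check; the expected values mirror the ARG pins above,
# and the distribution names are assumed, not defined by this Dockerfile.
from importlib.metadata import PackageNotFoundError, version

expected = {
    "transformers": "4.41.1",
    "accelerate": "0.29.2",
    "diffusers": "0.28.2",
    "transformers-neuronx": "0.11.351",
    "optimum-neuron": "0.0.23",
}

for name, pin in expected.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    marker = "OK" if installed.startswith(pin) else "CHECK"
    print(f"{name}: expected {pin}, found {installed} [{marker}]")
```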
serving/docker/scripts/install_inferentia2.sh (8 changes: 4 additions & 4 deletions)
@@ -15,12 +15,12 @@ echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" >/etc
 curl -L https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -
 
 # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/releasecontent.html#inf2-packages
-apt-get update -y && apt-get install -y aws-neuronx-collectives=2.20.22.0* \
-  aws-neuronx-runtime-lib=2.20.22.0* \
-  aws-neuronx-tools=2.17.1.0
+apt-get update -y && apt-get install -y aws-neuronx-collectives=2.21.46.0* \
+  aws-neuronx-runtime-lib=2.21.41.0* \
+  aws-neuronx-tools=2.18.3.0
 
 # TODO: Remove this hack after aws-neuronx-dkms install no longer throws an error, this bypasses the `set -ex`
 # exit criteria. The package is installed and functional after running, just throws an error on install.
-apt-get install -y aws-neuronx-dkms=2.16.7.0 || echo "Installed aws-neuronx-dkms with errors"
+apt-get install -y aws-neuronx-dkms=2.17.17.0 || echo "Installed aws-neuronx-dkms with errors"
 
 export PATH=/opt/aws/neuron/bin:$PATH
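And a similar hypothetical check for the apt-level pins installed by this script, querying dpkg for the Neuron system packages; the package names come from the script itself, the check is illustrative:

```python
# Illustrative only: confirm the apt-pinned Neuron packages after running the script.
import subprocess

packages = (
    "aws-neuronx-collectives",
    "aws-neuronx-runtime-lib",
    "aws-neuronx-tools",
    "aws-neuronx-dkms",
)

for pkg in packages:
    result = subprocess.run(
        ["dpkg-query", "-W", "-f=${Version}", pkg],
        capture_output=True, text=True)
    print(f"{pkg}: {result.stdout.strip() or 'not installed'}")
```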