diff --git a/serving/docker/lmi.Dockerfile b/serving/docker/lmi.Dockerfile
index 899a914380..c4d9d5dee4 100644
--- a/serving/docker/lmi.Dockerfile
+++ b/serving/docker/lmi.Dockerfile
@@ -16,7 +16,7 @@ ARG djl_version
 ARG djl_serving_version
 ARG python_version=3.11
 ARG djl_torch_version=2.5.1
-ARG djl_onnx_version=1.19.0
+ARG djl_onnx_version=1.20.0
 
 EXPOSE 8080
 
@@ -81,6 +81,7 @@ RUN apt-get update && apt-get install -yq libaio-dev libopenmpi-dev g++ unzip cu
 COPY requirements-lmi.txt ./requirements.txt
 RUN pip3 install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124 && pip3 cache purge
 RUN pip3 install -r requirements.txt \
+    && pip3 install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps \
    && git clone https://github.com/neuralmagic/AutoFP8.git \
    && cd AutoFP8 \
    && git reset --hard 4b2092c \
diff --git a/serving/docker/requirements-lmi.txt b/serving/docker/requirements-lmi.txt
index 86d9b3adce..38f492378c 100644
--- a/serving/docker/requirements-lmi.txt
+++ b/serving/docker/requirements-lmi.txt
@@ -21,7 +21,7 @@ safetensors
 scipy
 onnx
 sentence_transformers
-onnxruntime
+onnxruntime-gpu==1.20.0
 autoawq==0.2.5
 tokenizers==0.20.3
 pydantic==2.9.2
@@ -30,8 +30,6 @@ torch==2.5.1
 torchvision==0.20.1
 # sequence scheduler wheel for hf accelerate rolling batch
 https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
-# djl converter wheel for text-embedding use case
-https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
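Reviewer note: the requirements change swaps the CPU-only onnxruntime package for onnxruntime-gpu==1.20.0 (matching the djl_onnx_version bump above), and the djl_converter wheel moves into the Dockerfile with --no-deps so its pinned dependencies cannot drag the CPU runtime back in. A minimal sketch, assuming the built image's Python environment, to confirm the CUDA execution provider is actually registered:

    # Sanity check for the onnxruntime-gpu install inside the container.
    import onnxruntime as ort

    # onnxruntime-gpu should list CUDAExecutionProvider ahead of the CPU fallback.
    providers = ort.get_available_providers()
    print(providers)
    assert "CUDAExecutionProvider" in providers, "CPU-only onnxruntime is installed"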
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 24ac0d08e0..f6fec9f471 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -985,46 +985,55 @@ def get_model_name():
 }
 
 text_embedding_model_spec = {
-    "bge-base": {
+    "bge-base-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "e5-base-v2": {
+    "e5-base-v2-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "sentence-camembert-large": {
+    "sentence-camembert-large-rust": {
         "max_memory_per_gpu": [3.0, 3.0],
         "batch_size": [1, 8],
     },
-    "roberta-base": {
+    "roberta-base-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "msmarco-distilbert-base-v4": {
+    "msmarco-distilbert-base-v4-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "bge-reranker": {
+    "bge-reranker-rust": {
         "max_memory_per_gpu": [3.0, 3.0],
         "batch_size": [1, 8],
         "reranking": True,
     },
-    "e5-mistral-7b": {
+    "e5-mistral-7b-rust": {
         "max_memory_per_gpu": [18.0, 18.0],
         "batch_size": [1, 8],
     },
-    "gte-qwen2-7b": {
+    "gte-qwen2-7b-rust": {
         "max_memory_per_gpu": [18.0, 18.0],
         "batch_size": [1, 8],
     },
-    "gte-large": {
+    "gte-large-rust": {
         "max_memory_per_gpu": [3.0, 3.0],
         "batch_size": [1, 8],
     },
-    "bge-multilingual-gemma2": {
+    "bge-multilingual-gemma2-rust": {
         "max_memory_per_gpu": [20.0, 20.0],
         "batch_size": [1, 8],
+    },
+    "bge-base-onnx": {
+        "max_memory_per_gpu": [2.0, 2.0],
+        "batch_size": [1, 8],
+    },
+    "bge-reranker-onnx": {
+        "max_memory_per_gpu": [3.0, 3.0],
+        "batch_size": [1, 8],
+        "reranking": True,
     }
 }
@@ -1949,7 +1958,6 @@ def test_text_embedding_model(model, model_spec):
         req = {"inputs": batch_generation(batch_size)}
         logging.info(f"req {req}")
         res = send_json(req).json()
-        logging.info(f"res: {res}")
         assert len(res) == batch_size
         if "max_memory_per_gpu" in spec:
             validate_memory_usage(spec["max_memory_per_gpu"][i])
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index f941bf3a37..0eba190f36 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -1355,55 +1355,76 @@
 }
 
 text_embedding_model_list = {
-    "bge-base": {
+    "bge-base-rust": {
+        "engine": "Rust",
         "option.model_id": "BAAI/bge-base-en-v1.5",
         "batch_size": 8,
     },
-    "e5-base-v2": {
+    "e5-base-v2-rust": {
+        "engine": "Rust",
         "option.model_id": "intfloat/e5-base-v2",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "sentence-camembert-large": {
+    "sentence-camembert-large-rust": {
+        "engine": "Rust",
         "option.model_id": "dangvantuan/sentence-camembert-large",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "roberta-base": {
+    "roberta-base-rust": {
+        "engine": "Rust",
         "option.model_id": "relbert/relbert-roberta-base-nce-conceptnet",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "msmarco-distilbert-base-v4": {
+    "msmarco-distilbert-base-v4-rust": {
+        "engine": "Rust",
         "option.model_id": "sentence-transformers/msmarco-distilbert-base-v4",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "bge-reranker": {
+    "bge-reranker-rust": {
+        "engine": "Rust",
         "option.model_id": "BAAI/bge-reranker-base",
         "reranking": True,
         "batch_size": 8,
     },
-    "e5-mistral-7b": {
+    "e5-mistral-7b-rust": {
+        "engine": "Rust",
         "option.model_id": "intfloat/e5-mistral-7b-instruct",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "gte-qwen2-7b": {
+    "gte-qwen2-7b-rust": {
+        "engine": "Rust",
         "option.model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "gte-large": {
+    "gte-large-rust": {
+        "engine": "Rust",
         "option.model_id": "Alibaba-NLP/gte-large-en-v1.5",
         "option.trust_remote_code": "true",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "bge-multilingual-gemma2": {
+    "bge-multilingual-gemma2-rust": {
+        "engine": "Rust",
         "option.model_id": "BAAI/bge-multilingual-gemma2",
         "pooling": "cls",
         "batch_size": 8,
+    },
+    "bge-base-onnx": {
+        "engine": "OnnxRuntime",
+        "option.model_id": "BAAI/bge-base-en-v1.5",
+        "batch_size": 8,
+    },
+    "bge-reranker-onnx": {
+        "engine": "OnnxRuntime",
+        "option.model_id": "BAAI/bge-reranker-base",
+        "reranking": True,
+        "batch_size": 8,
     }
 }
@@ -1693,7 +1714,6 @@ def build_text_embedding_model(model):
             f"{model} is not one of the supporting handler {list(onnx_list.keys())}"
         )
     options = text_embedding_model_list[model]
-    options["engine"] = "Rust"
     options["option.task"] = "text_embedding"
     options["normalize"] = False
     write_model_artifacts(options)
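Reviewer note: build_text_embedding_model no longer hardcodes options["engine"] = "Rust"; the engine now comes from each text_embedding_model_list entry, which is what lets the two new OnnxRuntime entries coexist with the Rust ones. Based on the hunks above, the options dict the helper would hand to write_model_artifacts for "bge-base-onnx" works out to:

    # Effective options for "bge-base-onnx" (all values taken from this diff).
    options = {
        "engine": "OnnxRuntime",                     # now per-model, not hardcoded
        "option.model_id": "BAAI/bge-base-en-v1.5",
        "batch_size": 8,
        "option.task": "text_embedding",             # set by build_text_embedding_model
        "normalize": False,                          # set by build_text_embedding_model
    }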
client.run("text_embedding e5-base-v2-rust".split()) - def test_sentence_camembert_large(self): - with Runner('lmi', 'sentence-camembert-large') as r: - prepare.build_text_embedding_model("sentence-camembert-large") + def test_sentence_camembert_large_rust(self): + with Runner('lmi', 'sentence-camembert-large-rust') as r: + prepare.build_text_embedding_model("sentence-camembert-large-rust") r.launch() - client.run("text_embedding sentence-camembert-large".split()) + client.run("text_embedding sentence-camembert-large-rust".split()) - def test_roberta_base(self): - with Runner('lmi', 'roberta-base') as r: - prepare.build_text_embedding_model("roberta-base") + def test_roberta_base_rust(self): + with Runner('lmi', 'roberta-base-rust') as r: + prepare.build_text_embedding_model("roberta-base-rust") r.launch() - client.run("text_embedding roberta-base".split()) + client.run("text_embedding roberta-base-rust".split()) - def test_msmarco_distilbert_base_v4(self): - with Runner('lmi', 'msmarco-distilbert-base-v4') as r: - prepare.build_text_embedding_model("msmarco-distilbert-base-v4") + def test_msmarco_distilbert_base_v4_rust(self): + with Runner('lmi', 'msmarco-distilbert-base-v4-rust') as r: + prepare.build_text_embedding_model( + "msmarco-distilbert-base-v4-rust") r.launch() - client.run("text_embedding msmarco-distilbert-base-v4".split()) + client.run( + "text_embedding msmarco-distilbert-base-v4-rust".split()) + + def test_bge_reranker_rust(self): + with Runner('lmi', 'bge-reranker-rust') as r: + prepare.build_text_embedding_model("bge-reranker-rust") + r.launch() + client.run("text_embedding bge-reranker-rust".split()) + + def test_e5_mistral_7b_rust(self): + with Runner('lmi', 'e5-mistral-7b-rust') as r: + prepare.build_text_embedding_model("e5-mistral-7b-rust") + r.launch() + client.run("text_embedding e5-mistral-7b-rust".split()) - def test_bge_reranker(self): - with Runner('lmi', 'bge-reranker') as r: - prepare.build_text_embedding_model("bge-reranker") + def test_gte_qwen2_7b_rust(self): + with Runner('lmi', 'gte-qwen2-7b-rust') as r: + prepare.build_text_embedding_model("gte-qwen2-7b-rust") r.launch() - client.run("text_embedding bge-reranker".split()) + client.run("text_embedding gte-qwen2-7b-rust".split()) - def test_e5_mistral_7b(self): - with Runner('lmi', 'e5-mistral-7b') as r: - prepare.build_text_embedding_model("e5-mistral-7b") + def test_gte_large_rust(self): + with Runner('lmi', 'gte-large-rust') as r: + prepare.build_text_embedding_model("gte-large-rust") r.launch() - client.run("text_embedding e5-mistral-7b".split()) + client.run("text_embedding gte-large-rust".split()) - def test_gte_qwen2_7b(self): - with Runner('lmi', 'gte-qwen2-7b') as r: - prepare.build_text_embedding_model("gte-qwen2-7b") + def test_bge_multilingual_gemma2_rust(self): + with Runner('lmi', 'bge-multilingual-gemma2-rust') as r: + prepare.build_text_embedding_model("bge-multilingual-gemma2-rust") r.launch() - client.run("text_embedding gte-qwen2-7b".split()) + client.run("text_embedding bge-multilingual-gemma2-rust".split()) - def test_gte_large(self): - with Runner('lmi', 'gte-large') as r: - prepare.build_text_embedding_model("gte-large") + def test_bge_base_onnx(self): + with Runner('lmi', 'bge-base-onnx') as r: + prepare.build_text_embedding_model("bge-base-onnx") r.launch() - client.run("text_embedding gte-large".split()) + client.run("text_embedding bge-base-onnx".split()) - def test_bge_multilingual_gemma2(self): - with Runner('lmi', 'bge-multilingual-gemma2') as r: - 
prepare.build_text_embedding_model("bge-multilingual-gemma2") + def test_bge_reranker_onnx(self): + with Runner('lmi', 'bge-reranker-onnx') as r: + prepare.build_text_embedding_model("bge-reranker-onnx") r.launch() - client.run("text_embedding bge-multilingual-gemma2".split()) + client.run("text_embedding bge-reranker-onnx".split()) @pytest.mark.gpu
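Reviewer note: every new test follows the same prepare/launch/verify cycle. For reference, a hypothetical standalone equivalent of what client.run("text_embedding bge-base-onnx".split()) exercises; the endpoint path and port are assumptions (the Dockerfile EXPOSEs 8080), not something this diff confirms:

    # Hypothetical smoke test against a locally launched container.
    import requests

    req = {"inputs": ["what is deep learning", "embedding text with DJL Serving"]}
    res = requests.post("http://127.0.0.1:8080/invocations", json=req, timeout=60).json()
    assert len(res) == len(req["inputs"])  # one embedding (or score) per input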