[serving] Updates onnxruntime to 1.20.1 and adds integration tests
xyang16 committed Nov 25, 2024
1 parent 817c2b9 commit 5df1daf
Showing 5 changed files with 107 additions and 66 deletions.
serving/docker/lmi.Dockerfile: 3 changes (2 additions & 1 deletion)

@@ -16,7 +16,7 @@ ARG djl_version
 ARG djl_serving_version
 ARG python_version=3.11
 ARG djl_torch_version=2.5.1
-ARG djl_onnx_version=1.19.0
+ARG djl_onnx_version=1.20.0
 
 EXPOSE 8080
 
@@ -81,6 +81,7 @@ RUN apt-get update && apt-get install -yq libaio-dev libopenmpi-dev g++ unzip cu
 COPY requirements-lmi.txt ./requirements.txt
 RUN pip3 install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124 && pip3 cache purge
 RUN pip3 install -r requirements.txt \
+    && pip3 install https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl --no-deps \
     && git clone https://github.com/neuralmagic/AutoFP8.git \
     && cd AutoFP8 \
     && git reset --hard 4b2092c \
serving/docker/requirements-lmi.txt: 4 changes (1 addition & 3 deletions)

@@ -21,7 +21,7 @@ safetensors
 scipy
 onnx
 sentence_transformers
-onnxruntime
+onnxruntime-gpu==1.20.0
 autoawq==0.2.5
 tokenizers==0.20.3
 pydantic==2.9.2
@@ -30,8 +30,6 @@ torch==2.5.1
 torchvision==0.20.1
 # sequence scheduler wheel for hf accelerate rolling batch
 https://publish.djl.ai/seq_scheduler/seq_scheduler-0.1.0-py3-none-any.whl
-# djl converter wheel for text-embedding use case
-https://publish.djl.ai/djl_converter/djl_converter-0.31.0-py3-none-any.whl
 # flash infer kernels for vllm/lmi-dist
 https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu124torch2.4-cp311-cp311-linux_x86_64.whl
 # vllm wheel built with pt2.5.1
tests/integration/llm/client.py: 30 changes (19 additions & 11 deletions)

@@ -985,46 +985,55 @@ def get_model_name():
 }
 
 text_embedding_model_spec = {
-    "bge-base": {
+    "bge-base-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "e5-base-v2": {
+    "e5-base-v2-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "sentence-camembert-large": {
+    "sentence-camembert-large-rust": {
         "max_memory_per_gpu": [3.0, 3.0],
         "batch_size": [1, 8],
     },
-    "roberta-base": {
+    "roberta-base-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "msmarco-distilbert-base-v4": {
+    "msmarco-distilbert-base-v4-rust": {
         "max_memory_per_gpu": [2.0, 2.0],
         "batch_size": [1, 8],
     },
-    "bge-reranker": {
+    "bge-reranker-rust": {
         "max_memory_per_gpu": [3.0, 3.0],
         "batch_size": [1, 8],
         "reranking": True,
     },
-    "e5-mistral-7b": {
+    "e5-mistral-7b-rust": {
         "max_memory_per_gpu": [18.0, 18.0],
         "batch_size": [1, 8],
     },
-    "gte-qwen2-7b": {
+    "gte-qwen2-7b-rust": {
         "max_memory_per_gpu": [18.0, 18.0],
         "batch_size": [1, 8],
     },
-    "gte-large": {
+    "gte-large-rust": {
         "max_memory_per_gpu": [3.0, 3.0],
         "batch_size": [1, 8],
     },
-    "bge-multilingual-gemma2": {
+    "bge-multilingual-gemma2-rust": {
         "max_memory_per_gpu": [20.0, 20.0],
         "batch_size": [1, 8],
     },
+    "bge-base-onnx": {
+        "max_memory_per_gpu": [2.0, 2.0],
+        "batch_size": [1, 8],
+    },
+    "bge-reranker-onnx": {
+        "max_memory_per_gpu": [3.0, 3.0],
+        "batch_size": [1, 8],
+        "reranking": True,
+    }
 }

@@ -1949,7 +1958,6 @@ def test_text_embedding_model(model, model_spec):
         req = {"inputs": batch_generation(batch_size)}
         logging.info(f"req {req}")
         res = send_json(req).json()
-        logging.info(f"res: {res}")
         assert len(res) == batch_size
         if "max_memory_per_gpu" in spec:
             validate_memory_usage(spec["max_memory_per_gpu"][i])
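
For context, the fragment above sits inside the per-batch-size loop of test_text_embedding_model. A minimal sketch of the surrounding function as it plausibly reads after this commit, reconstructed from the visible lines (the spec lookup and the loop header are assumptions; the helpers are the ones client.py already uses):

    # Sketch only: reconstructed around the diff fragment above.
    def test_text_embedding_model(model, model_spec):
        spec = model_spec[model]  # assumed lookup into text_embedding_model_spec
        for i, batch_size in enumerate(spec["batch_size"]):
            # One input per batch slot; the server should return one embedding
            # (or reranking score) per input.
            req = {"inputs": batch_generation(batch_size)}
            logging.info(f"req {req}")
            res = send_json(req).json()
            assert len(res) == batch_size
            # Memory ceilings are indexed per batch-size entry, e.g. [2.0, 2.0] GB.
            if "max_memory_per_gpu" in spec:
                validate_memory_usage(spec["max_memory_per_gpu"][i])
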
tests/integration/llm/prepare.py: 42 changes (31 additions & 11 deletions)

@@ -1355,55 +1355,76 @@
 }
 
 text_embedding_model_list = {
-    "bge-base": {
+    "bge-base-rust": {
+        "engine": "Rust",
         "option.model_id": "BAAI/bge-base-en-v1.5",
         "batch_size": 8,
     },
-    "e5-base-v2": {
+    "e5-base-v2-rust": {
+        "engine": "Rust",
         "option.model_id": "intfloat/e5-base-v2",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "sentence-camembert-large": {
+    "sentence-camembert-large-rust": {
+        "engine": "Rust",
         "option.model_id": "dangvantuan/sentence-camembert-large",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "roberta-base": {
+    "roberta-base-rust": {
+        "engine": "Rust",
         "option.model_id": "relbert/relbert-roberta-base-nce-conceptnet",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "msmarco-distilbert-base-v4": {
+    "msmarco-distilbert-base-v4-rust": {
+        "engine": "Rust",
         "option.model_id": "sentence-transformers/msmarco-distilbert-base-v4",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "bge-reranker": {
+    "bge-reranker-rust": {
+        "engine": "Rust",
         "option.model_id": "BAAI/bge-reranker-base",
         "reranking": True,
         "batch_size": 8,
     },
-    "e5-mistral-7b": {
+    "e5-mistral-7b-rust": {
+        "engine": "Rust",
         "option.model_id": "intfloat/e5-mistral-7b-instruct",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "gte-qwen2-7b": {
+    "gte-qwen2-7b-rust": {
+        "engine": "Rust",
         "option.model_id": "Alibaba-NLP/gte-Qwen2-7B-instruct",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "gte-large": {
+    "gte-large-rust": {
+        "engine": "Rust",
         "option.model_id": "Alibaba-NLP/gte-large-en-v1.5",
         "option.trust_remote_code": "true",
         "pooling": "cls",
         "batch_size": 8,
     },
-    "bge-multilingual-gemma2": {
+    "bge-multilingual-gemma2-rust": {
+        "engine": "Rust",
         "option.model_id": "BAAI/bge-multilingual-gemma2",
         "pooling": "cls",
         "batch_size": 8,
     },
+    "bge-base-onnx": {
+        "engine": "OnnxRuntime",
+        "option.model_id": "BAAI/bge-base-en-v1.5",
+        "batch_size": 8,
+    },
+    "bge-reranker-onnx": {
+        "engine": "OnnxRuntime",
+        "option.model_id": "BAAI/bge-reranker-base",
+        "reranking": True,
+        "batch_size": 8,
+    }
 }
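
The two new OnnxRuntime entries reuse the same Hugging Face model IDs as their Rust counterparts and differ only in the engine key. Judging by the option.* key names, these dicts are flattened into a per-model serving.properties file by write_model_artifacts (called in the next hunk; its implementation is not shown here). A minimal sketch of that flattening under this assumption, with render_properties as a hypothetical helper:

    # Hypothetical: each key/value pair becomes one serving.properties line.
    def render_properties(options: dict) -> str:
        return "\n".join(f"{k}={v}" for k, v in options.items())

    # For "bge-base-onnx", after build_text_embedding_model adds option.task
    # and normalize (see the next hunk), this would yield roughly:
    #   engine=OnnxRuntime
    #   option.model_id=BAAI/bge-base-en-v1.5
    #   batch_size=8
    #   option.task=text_embedding
    #   normalize=False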

@@ -1693,7 +1714,6 @@ def build_text_embedding_model(model):
f"{model} is not one of the supporting handler {list(onnx_list.keys())}"
)
options = text_embedding_model_list[model]
options["engine"] = "Rust"
options["option.task"] = "text_embedding"
options["normalize"] = False
write_model_artifacts(options)
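
The removed assignment above is the crux of the refactor: the engine used to be forced to "Rust" here and now comes from each entry in text_embedding_model_list, which is what lets the OnnxRuntime models share this code path. A sketch of the resulting function, with the guard clause reconstructed from context (the diff's error message references onnx_list; the sketch substitutes text_embedding_model_list so it is self-contained):

    def build_text_embedding_model(model):
        # Fail fast for models without a registered configuration.
        if model not in text_embedding_model_list:
            raise ValueError(
                f"{model} is not one of the supporting handler "
                f"{list(text_embedding_model_list.keys())}")
        options = text_embedding_model_list[model]
        # "engine" is no longer hardcoded here; the per-model entry supplies
        # either "Rust" or "OnnxRuntime".
        options["option.task"] = "text_embedding"
        options["normalize"] = False
        write_model_artifacts(options)
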
tests/integration/tests.py: 94 changes (54 additions & 40 deletions)

@@ -1053,65 +1053,79 @@ def test_llama31_8b_tp2_pp2_specdec(self):
 @pytest.mark.gpu
 class TestTextEmbedding:
 
-    def test_bge_base(self):
-        with Runner('lmi', 'bge-base') as r:
-            prepare.build_text_embedding_model("bge-base")
+    def test_bge_base_rust(self):
+        with Runner('lmi', 'bge-base-rust') as r:
+            prepare.build_text_embedding_model("bge-base-rust")
             r.launch()
-            client.run("text_embedding bge-base".split())
+            client.run("text_embedding bge-base-rust".split())
 
-    def test_e5_base_v2(self):
-        with Runner('lmi', 'e5-base-v2') as r:
-            prepare.build_text_embedding_model("e5-base-v2")
+    def test_e5_base_v2_rust(self):
+        with Runner('lmi', 'e5-base-v2-rust') as r:
+            prepare.build_text_embedding_model("e5-base-v2-rust")
             r.launch()
-            client.run("text_embedding e5-base-v2".split())
+            client.run("text_embedding e5-base-v2-rust".split())
 
-    def test_sentence_camembert_large(self):
-        with Runner('lmi', 'sentence-camembert-large') as r:
-            prepare.build_text_embedding_model("sentence-camembert-large")
+    def test_sentence_camembert_large_rust(self):
+        with Runner('lmi', 'sentence-camembert-large-rust') as r:
+            prepare.build_text_embedding_model("sentence-camembert-large-rust")
             r.launch()
-            client.run("text_embedding sentence-camembert-large".split())
+            client.run("text_embedding sentence-camembert-large-rust".split())
 
-    def test_roberta_base(self):
-        with Runner('lmi', 'roberta-base') as r:
-            prepare.build_text_embedding_model("roberta-base")
+    def test_roberta_base_rust(self):
+        with Runner('lmi', 'roberta-base-rust') as r:
+            prepare.build_text_embedding_model("roberta-base-rust")
             r.launch()
-            client.run("text_embedding roberta-base".split())
+            client.run("text_embedding roberta-base-rust".split())
 
-    def test_msmarco_distilbert_base_v4(self):
-        with Runner('lmi', 'msmarco-distilbert-base-v4') as r:
-            prepare.build_text_embedding_model("msmarco-distilbert-base-v4")
+    def test_msmarco_distilbert_base_v4_rust(self):
+        with Runner('lmi', 'msmarco-distilbert-base-v4-rust') as r:
+            prepare.build_text_embedding_model(
+                "msmarco-distilbert-base-v4-rust")
             r.launch()
-            client.run("text_embedding msmarco-distilbert-base-v4".split())
+            client.run(
+                "text_embedding msmarco-distilbert-base-v4-rust".split())
 
+    def test_bge_reranker_rust(self):
+        with Runner('lmi', 'bge-reranker-rust') as r:
+            prepare.build_text_embedding_model("bge-reranker-rust")
+            r.launch()
+            client.run("text_embedding bge-reranker-rust".split())
+
+    def test_e5_mistral_7b_rust(self):
+        with Runner('lmi', 'e5-mistral-7b-rust') as r:
+            prepare.build_text_embedding_model("e5-mistral-7b-rust")
+            r.launch()
+            client.run("text_embedding e5-mistral-7b-rust".split())
+
-    def test_bge_reranker(self):
-        with Runner('lmi', 'bge-reranker') as r:
-            prepare.build_text_embedding_model("bge-reranker")
+    def test_gte_qwen2_7b_rust(self):
+        with Runner('lmi', 'gte-qwen2-7b-rust') as r:
+            prepare.build_text_embedding_model("gte-qwen2-7b-rust")
             r.launch()
-            client.run("text_embedding bge-reranker".split())
+            client.run("text_embedding gte-qwen2-7b-rust".split())
 
-    def test_e5_mistral_7b(self):
-        with Runner('lmi', 'e5-mistral-7b') as r:
-            prepare.build_text_embedding_model("e5-mistral-7b")
+    def test_gte_large_rust(self):
+        with Runner('lmi', 'gte-large-rust') as r:
+            prepare.build_text_embedding_model("gte-large-rust")
             r.launch()
-            client.run("text_embedding e5-mistral-7b".split())
+            client.run("text_embedding gte-large-rust".split())
 
-    def test_gte_qwen2_7b(self):
-        with Runner('lmi', 'gte-qwen2-7b') as r:
-            prepare.build_text_embedding_model("gte-qwen2-7b")
+    def test_bge_multilingual_gemma2_rust(self):
+        with Runner('lmi', 'bge-multilingual-gemma2-rust') as r:
+            prepare.build_text_embedding_model("bge-multilingual-gemma2-rust")
             r.launch()
-            client.run("text_embedding gte-qwen2-7b".split())
+            client.run("text_embedding bge-multilingual-gemma2-rust".split())
 
-    def test_gte_large(self):
-        with Runner('lmi', 'gte-large') as r:
-            prepare.build_text_embedding_model("gte-large")
+    def test_bge_base_onnx(self):
+        with Runner('lmi', 'bge-base-onnx') as r:
+            prepare.build_text_embedding_model("bge-base-onnx")
             r.launch()
-            client.run("text_embedding gte-large".split())
+            client.run("text_embedding bge-base-onnx".split())
 
-    def test_bge_multilingual_gemma2(self):
-        with Runner('lmi', 'bge-multilingual-gemma2') as r:
-            prepare.build_text_embedding_model("bge-multilingual-gemma2")
+    def test_bge_reranker_onnx(self):
+        with Runner('lmi', 'bge-reranker-onnx') as r:
+            prepare.build_text_embedding_model("bge-reranker-onnx")
             r.launch()
-            client.run("text_embedding bge-multilingual-gemma2".split())
+            client.run("text_embedding bge-reranker-onnx".split())


 @pytest.mark.gpu
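
Because the old test IDs no longer exist after the renames, any CI selection keyed on them needs updating. One way to run only the new ONNX coverage, using the file path and gpu marker visible in this diff (the exact flags depend on how the suite is normally invoked, so treat this as a hedged example):

    import pytest

    # Select the two new OnnxRuntime text-embedding tests on a GPU host.
    pytest.main(["tests/integration/tests.py", "-m", "gpu", "-k", "onnx"])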
