Commit

demo
noooop committed Sep 30, 2024
1 parent b29e716 commit 5f05fa9
Showing 7 changed files with 188 additions and 0 deletions.
Empty file added demo_temporary/__init__.py
Empty file.
Empty file.
90 changes: 90 additions & 0 deletions demo_temporary/benchmarks/benchmark_attention_impl.py
@@ -0,0 +1,90 @@
import os
import random
import time


def benchmark_vllm(args):
random.seed(args.seed)
os.environ["VLLM_ATTENTION_BACKEND"] = args.attention_impl

import gc

import torch

from vllm.wde.encode_only.arg_utils import ( # noqa: E501
EncodeOnlyEngineArgs as EngineArgs)
from vllm.wde.entrypoints.llm import LLMEngine

prompt = "if" * args.input_len
requests = [prompt for _ in range(args.num_prompts)]

engine_args = EngineArgs(model=args.model,
tokenizer=args.tokenizer,
seed=args.seed,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
device=args.device,
max_num_seqs=32,
scheduling=args.scheduling)

engine = LLMEngine.from_engine_args(engine_args)

for batchsize in args.batchsize:
engine.engine_config.scheduler_config.set_args(max_num_seqs=batchsize)

start = time.perf_counter()
for request_id, prompt in enumerate(requests):
engine.add_request(str(request_id), prompt)

n_step = 0
while engine.has_unfinished_requests():
engine.step()
n_step += 1
end = time.perf_counter()

elapsed_time = end - start
delay = elapsed_time / n_step

print(f"Batchsize {batchsize}, Throughput: "
f"{len(requests) / elapsed_time:.4f} requests/s, "
f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")

engine.executor.shutdown_execute_loop()
gc.collect()
torch.cuda.empty_cache()


if __name__ == '__main__':
from easydict import EasyDict as edict

from vllm.wde.prefill_only.layers.attention.selector import AttentionImpls
args = edict()

args.input_len = 256
args.num_prompts = 10000

args.model = "google-bert/bert-base-uncased"

args.trust_remote_code = False
args.tokenizer = args.model
args.seed = 0
args.max_model_len = None
args.device = "cuda"
args.batchsize = [1, 2, 4, 8, 16, 32, 64]
args.scheduling = "double_buffer"

from concurrent.futures import ProcessPoolExecutor

def run_vllm(args):
with ProcessPoolExecutor(1) as executor:
f = executor.submit(benchmark_vllm, args)
f.result()

for dtype, attention_impls in AttentionImpls.items():
print("dtype:", dtype)
for attention_impl in attention_impls:
print("attention_impl:", attention_impl)
args.attention_impl = attention_impl
args.dtype = dtype
run_vllm(args)
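
Note: the loop above assumes AttentionImpls (imported from vllm.wde.prefill_only.layers.attention.selector) maps each dtype name to the attention backends usable at that dtype. A purely hypothetical illustration of that shape, for readers without the branch checked out (the real contents live in the selector module and may differ):

# Hypothetical illustration only; the actual mapping is defined in
# vllm.wde.prefill_only.layers.attention.selector and may differ.
AttentionImpls = {
    "float": ["TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"],
    "half": ["FLASH_ATTN", "TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"],
    "bfloat16": ["FLASH_ATTN", "TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"],
}

Each (dtype, backend) pair is then benchmarked in its own subprocess, so the VLLM_ATTENTION_BACKEND environment variable set at the top of benchmark_vllm takes effect before vllm is imported in that process.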
83 changes: 83 additions & 0 deletions demo_temporary/benchmarks/benchmark_bert.py
@@ -0,0 +1,83 @@
import random
import time


def benchmark_vllm(args):
random.seed(args.seed)

import gc

import torch

from vllm.wde.encode_only.arg_utils import ( # noqa: E501
EncodeOnlyEngineArgs as EngineArgs)
from vllm.wde.entrypoints.llm import LLMEngine

prompt = "if" * args.input_len
requests = [prompt for _ in range(args.num_prompts)]

engine_args = EngineArgs(model=args.model,
tokenizer=args.tokenizer,
seed=args.seed,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
device=args.device,
max_num_seqs=32,
scheduling=args.scheduling)

engine = LLMEngine.from_engine_args(engine_args)

for batchsize in args.batchsize:
engine.engine_config.scheduler_config.set_args(max_num_seqs=batchsize)

start = time.perf_counter()
for request_id, prompt in enumerate(requests):
engine.add_request(str(request_id), prompt)

n_step = 0
while engine.has_unfinished_requests():
engine.step()
n_step += 1
end = time.perf_counter()

elapsed_time = end - start
delay = elapsed_time / n_step

print(f"Batchsize {batchsize}, Throughput: "
f"{len(requests) / elapsed_time:.4f} requests/s, "
f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")

engine.executor.shutdown_execute_loop()
gc.collect()
torch.cuda.empty_cache()


if __name__ == '__main__':
from easydict import EasyDict as edict
args = edict()

args.input_len = 256
args.num_prompts = 10000

args.model = "google-bert/bert-base-uncased"

args.trust_remote_code = False
args.tokenizer = args.model
args.seed = 0
args.max_model_len = None
args.dtype = "half"
args.device = "cuda"
args.batchsize = [1, 2, 4, 8, 16, 32, 64]

from concurrent.futures import ProcessPoolExecutor

def run_vllm(args):
with ProcessPoolExecutor(1) as executor:
f = executor.submit(benchmark_vllm, args)
f.result()

for scheduling in ["sync", "async", "double_buffer"]:
print(scheduling)
args.scheduling = scheduling
run_vllm(args)
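
For quick single-configuration debugging, a minimal sketch of calling benchmark_vllm directly (assuming it is imported from benchmark_bert.py; the ProcessPoolExecutor wrapper above appears to exist so each run gets a fresh process and a clean CUDA context):

from easydict import EasyDict as edict

# Hypothetical direct invocation; values mirror the defaults above,
# with a smaller num_prompts for a fast sanity check.
args = edict(input_len=256, num_prompts=100,
             model="google-bert/bert-base-uncased",
             tokenizer="google-bert/bert-base-uncased",
             trust_remote_code=False, seed=0, max_model_len=None,
             dtype="half", device="cuda",
             batchsize=[1, 8, 32], scheduling="sync")

benchmark_vllm(args)  # runs in the current process instead of a worker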
Empty file.
14 changes: 14 additions & 0 deletions demo_temporary/examples/offline_inference_bert.py
@@ -0,0 +1,14 @@
from vllm.wde.entrypoints.llm import LLM

prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

llm = LLM(model="google-bert/bert-base-uncased")

outputs = llm.encode(prompts)
for output in outputs:
print(output.outputs.shape)
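
A minimal follow-on sketch, assuming each output.outputs is a (num_tokens, hidden_size) torch tensor as the .shape print above suggests: mean-pooling the token embeddings yields a fixed-size sentence vector.

from vllm.wde.entrypoints.llm import LLM

llm = LLM(model="google-bert/bert-base-uncased")

for output in llm.encode(["Hello, my name is"]):
    token_embeddings = output.outputs                  # assumed shape: (num_tokens, hidden_size)
    sentence_embedding = token_embeddings.mean(dim=0)  # mean pooling over the token axis
    print(sentence_embedding.shape)                    # e.g. torch.Size([768]) for bert-base-uncased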
1 change: 1 addition & 0 deletions tests/wde/core/processor/test_input_processor.py
@@ -1,3 +1,4 @@
# mypy: ignore-errors
import pytest

from vllm.wde.core.processor.input_processor import TextInputProcessor