Commit

demo
noooop committed Sep 30, 2024
1 parent b29e716 commit 5f05fa9
Showing 7 changed files with 188 additions and 0 deletions.
Empty file added demo_temporary/__init__.py
Empty file.
Empty file.
90 changes: 90 additions & 0 deletions demo_temporary/benchmarks/benchmark_attention_impl.py
@@ -0,0 +1,90 @@
import os
import random
import time


def benchmark_vllm(args):
random.seed(args.seed)
os.environ["VLLM_ATTENTION_BACKEND"] = args.attention_impl

import gc

import torch

from vllm.wde.encode_only.arg_utils import ( # noqa: E501
EncodeOnlyEngineArgs as EngineArgs)
from vllm.wde.entrypoints.llm import LLMEngine

prompt = "if" * args.input_len
requests = [prompt for _ in range(args.num_prompts)]

engine_args = EngineArgs(model=args.model,
tokenizer=args.tokenizer,
seed=args.seed,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
device=args.device,
max_num_seqs=32,
scheduling=args.scheduling)

engine = LLMEngine.from_engine_args(engine_args)

for batchsize in args.batchsize:
engine.engine_config.scheduler_config.set_args(max_num_seqs=batchsize)

start = time.perf_counter()
for request_id, prompt in enumerate(requests):
engine.add_request(str(request_id), prompt)

n_step = 0
while engine.has_unfinished_requests():
engine.step()
n_step += 1
end = time.perf_counter()

elapsed_time = end - start
delay = elapsed_time / n_step

print(f"Batchsize {batchsize}, Throughput: "
f"{len(requests) / elapsed_time:.4f} requests/s, "
f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")

engine.executor.shutdown_execute_loop()
gc.collect()
torch.cuda.empty_cache()


if __name__ == '__main__':
from easydict import EasyDict as edict

from vllm.wde.prefill_only.layers.attention.selector import AttentionImpls
args = edict()

args.input_len = 256
args.num_prompts = 10000

args.model = "google-bert/bert-base-uncased"

args.trust_remote_code = False
args.tokenizer = args.model
args.seed = 0
args.max_model_len = None
args.device = "cuda"
args.batchsize = [1, 2, 4, 8, 16, 32, 64]
args.scheduling = "double_buffer"

from concurrent.futures import ProcessPoolExecutor

def run_vllm(args):
with ProcessPoolExecutor(1) as executor:
f = executor.submit(benchmark_vllm, args)
f.result()

for dtype, attention_impls in AttentionImpls.items():
print("dtype:", dtype)
for attention_impl in attention_impls:
print("attention_impl:", attention_impl)
args.attention_impl = attention_impl
args.dtype = dtype
run_vllm(args)
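
Note: the loop above assumes AttentionImpls (imported from vllm.wde.prefill_only.layers.attention.selector) maps each dtype name to the attention backends usable at that dtype. A purely hypothetical illustration of that shape, for readers without the branch checked out (the real contents live in the selector module and may differ):

# Hypothetical illustration only; the actual mapping is defined in
# vllm.wde.prefill_only.layers.attention.selector and may differ.
AttentionImpls = {
    "float": ["TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"],
    "half": ["FLASH_ATTN", "TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"],
    "bfloat16": ["FLASH_ATTN", "TORCH_SDPA", "XFORMERS", "TORCH_NAIVE"],
}

Each (dtype, backend) pair is then benchmarked in its own subprocess, so the VLLM_ATTENTION_BACKEND environment variable set at the top of benchmark_vllm takes effect before vllm is imported in that process.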
83 changes: 83 additions & 0 deletions demo_temporary/benchmarks/benchmark_bert.py
@@ -0,0 +1,83 @@
import random
import time


def benchmark_vllm(args):
random.seed(args.seed)

import gc

import torch

from vllm.wde.encode_only.arg_utils import ( # noqa: E501
EncodeOnlyEngineArgs as EngineArgs)
from vllm.wde.entrypoints.llm import LLMEngine

prompt = "if" * args.input_len
requests = [prompt for _ in range(args.num_prompts)]

engine_args = EngineArgs(model=args.model,
tokenizer=args.tokenizer,
seed=args.seed,
trust_remote_code=args.trust_remote_code,
dtype=args.dtype,
max_model_len=args.max_model_len,
device=args.device,
max_num_seqs=32,
scheduling=args.scheduling)

engine = LLMEngine.from_engine_args(engine_args)

for batchsize in args.batchsize:
engine.engine_config.scheduler_config.set_args(max_num_seqs=batchsize)

start = time.perf_counter()
for request_id, prompt in enumerate(requests):
engine.add_request(str(request_id), prompt)

n_step = 0
while engine.has_unfinished_requests():
engine.step()
n_step += 1
end = time.perf_counter()

elapsed_time = end - start
delay = elapsed_time / n_step

print(f"Batchsize {batchsize}, Throughput: "
f"{len(requests) / elapsed_time:.4f} requests/s, "
f"Delay {delay * 1000:0.2f} ms, n_step {n_step}")

engine.executor.shutdown_execute_loop()
gc.collect()
torch.cuda.empty_cache()


if __name__ == '__main__':
from easydict import EasyDict as edict
args = edict()

args.input_len = 256
args.num_prompts = 10000

args.model = "google-bert/bert-base-uncased"

args.trust_remote_code = False
args.tokenizer = args.model
args.seed = 0
args.max_model_len = None
args.dtype = "half"
args.device = "cuda"
args.batchsize = [1, 2, 4, 8, 16, 32, 64]

from concurrent.futures import ProcessPoolExecutor

def run_vllm(args):
with ProcessPoolExecutor(1) as executor:
f = executor.submit(benchmark_vllm, args)
f.result()

for scheduling in ["sync", "async", "double_buffer"]:
print(scheduling)
args.scheduling = scheduling
run_vllm(args)
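
For quick single-configuration debugging, a minimal sketch of calling benchmark_vllm directly (assuming it is imported from benchmark_bert.py; the ProcessPoolExecutor wrapper above appears to exist so each run gets a fresh process and a clean CUDA context):

from easydict import EasyDict as edict

# Hypothetical direct invocation; values mirror the defaults above,
# with a smaller num_prompts for a fast sanity check.
args = edict(input_len=256, num_prompts=100,
             model="google-bert/bert-base-uncased",
             tokenizer="google-bert/bert-base-uncased",
             trust_remote_code=False, seed=0, max_model_len=None,
             dtype="half", device="cuda",
             batchsize=[1, 8, 32], scheduling="sync")

benchmark_vllm(args)  # runs in the current process instead of a worker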
Empty file.
14 changes: 14 additions & 0 deletions demo_temporary/examples/offline_inference_bert.py
@@ -0,0 +1,14 @@
from vllm.wde.entrypoints.llm import LLM

prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]

llm = LLM(model="google-bert/bert-base-uncased")

outputs = llm.encode(prompts)
for output in outputs:
print(output.outputs.shape)
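
A minimal follow-on sketch, assuming each output.outputs is a (num_tokens, hidden_size) torch tensor as the .shape print above suggests: mean-pooling the token embeddings yields a fixed-size sentence vector.

from vllm.wde.entrypoints.llm import LLM

llm = LLM(model="google-bert/bert-base-uncased")

for output in llm.encode(["Hello, my name is"]):
    token_embeddings = output.outputs                  # assumed shape: (num_tokens, hidden_size)
    sentence_embedding = token_embeddings.mean(dim=0)  # mean pooling over the token axis
    print(sentence_embedding.shape)                    # e.g. torch.Size([768]) for bert-base-uncased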
1 change: 1 addition & 0 deletions tests/wde/core/processor/test_input_processor.py
@@ -1,3 +1,4 @@
# mypy: ignore-errors
import pytest

from vllm.wde.core.processor.input_processor import TextInputProcessor