From 3427a91d3743e99250b88b58a93afb5431687e5c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 20 Jan 2025 15:00:59 +0800 Subject: [PATCH] [Core] Interface for accessing model from `VllmRunner` (#10353) Signed-off-by: DarkLight1337 Signed-off-by: Bowen Wang --- tests/conftest.py | 5 + tests/engine/test_custom_executor.py | 4 +- .../test_model_load_with_params.py | 64 ++--- .../decoder_only/language/test_jamba.py | 7 +- .../decoder_only/language/test_mamba.py | 7 +- .../decoder_only/language/test_models.py | 7 +- .../vision_language/test_qwen2_vl.py | 49 ++-- .../embedding/language/test_cls_models.py | 7 +- .../embedding/language/test_embedding.py | 7 +- tests/quantization/test_compressed_tensors.py | 242 ++++++++++-------- tests/quantization/test_fp8.py | 52 ++-- tests/quantization/test_lm_head.py | 37 +-- tests/quantization/test_quark.py | 23 +- tests/tensorizer_loader/test_tensorizer.py | 34 ++- vllm/engine/llm_engine.py | 17 +- vllm/entrypoints/llm.py | 52 ++-- vllm/executor/executor_base.py | 50 +++- vllm/executor/mp_distributed_executor.py | 2 +- .../model_executor/model_loader/tensorizer.py | 17 +- vllm/spec_decode/ngram_worker.py | 12 +- .../spec_decode/smaller_tp_proposer_worker.py | 12 + vllm/spec_decode/spec_decode_worker.py | 4 + vllm/v1/executor/multiproc_executor.py | 16 +- vllm/v1/worker/gpu_model_runner.py | 3 + vllm/v1/worker/gpu_worker.py | 4 + vllm/worker/cpu_model_runner.py | 3 + vllm/worker/hpu_model_runner.py | 4 + vllm/worker/model_runner.py | 3 + vllm/worker/model_runner_base.py | 9 +- vllm/worker/neuron_model_runner.py | 3 + vllm/worker/openvino_model_runner.py | 3 + vllm/worker/openvino_worker.py | 4 + vllm/worker/tpu_model_runner.py | 3 + vllm/worker/worker_base.py | 12 + vllm/worker/xpu_model_runner.py | 3 + 35 files changed, 474 insertions(+), 307 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 95af4ac1eb17b..279c1bf9a3776 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -244,6 +244,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -930,6 +931,10 @@ def score( req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index fdfcd4f4c9d50..0e33f3662da82 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -51,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomUniExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0609fd96825e3..9c1f784c1c93b 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a 
robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,12 @@ def test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output @@ -103,17 +104,18 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): model_name = "FacebookAI/roberta-base" with vllm_runner(model_name=model_name, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_tokenizer = model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.model.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert not hasattr(model, "lm_head") - assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model._pooler, CLSPool) + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert 
not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) assert output diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e8b7..2e06b10fbb827 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02253..1ad4f5aae8f5b 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 4e110366a09f3..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -73,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 2fd22f0cc88ec..5a485f3d81747 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, @@ -106,16 +105,18 @@ def batch_make_image_embeddings( image_grid_thw = preprocess_result["image_grid_thw"] # pixel values to embeddings & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. \ - model_runner.model.visual + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -150,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. 
@@ -187,16 +188,18 @@ def batch_make_video_embeddings( video_grid_thw = preprocess_result["video_grid_thw"] # pixel values to embeddings & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -278,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, images, videos in inputs ] diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22f69..0cbe4afe96c0a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index bb47d14807b55..e17198e385475 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -62,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 92436889ecffe..0cd86cef0a475 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -30,50 +30,55 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj - - # assert zp for symmetric and asymmetric cases - def zp_valid(zp: Optional[torch.Tensor]): - if is_symmetric: - return zp is None - - return zp is not None and zp.dtype is torch.int32 - - assert zp_valid(qkv_proj.input_zero_point) - assert zp_valid(o_proj.input_zero_point) - assert zp_valid(gate_up_proj.input_zero_point) - assert zp_valid(down_proj.input_zero_point) - - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.is_static_input_scheme - expected_type = torch.int8 - - assert qkv_proj.weight.dtype is expected_type - assert o_proj.weight.dtype is expected_type - assert gate_up_proj.weight.dtype is expected_type - - if qkv_proj.scheme.strategy == "tensor": - # Make sure it is a channelwise buffer - # After running process_weights_after_loading - assert len(qkv_proj.weight_scale.shape) == 2 - assert qkv_proj.weight_scale.shape[0] == shape_0 - assert qkv_proj.weight_scale.shape[1] == 1 - assert qkv_proj.weight_scale.dtype is torch.float32 - assert qkv_proj.input_scale.dtype is torch.float32 + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None + + return zp is not None and zp.dtype is torch.int32 + + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.is_static_input_scheme + expected_type = torch.int8 + + assert qkv_proj.weight.dtype is expected_type + assert o_proj.weight.dtype is expected_type + assert gate_up_proj.weight.dtype is expected_type + + if 
qkv_proj.scheme.strategy == "tensor": + # Make sure it is a channelwise buffer + # After running process_weights_after_loading + assert len(qkv_proj.weight_scale.shape) == 2 + assert qkv_proj.weight_scale.shape[0] == shape_0 + assert qkv_proj.weight_scale.shape[1] == 1 + assert qkv_proj.weight_scale.dtype is torch.float32 + assert qkv_proj.input_scale.dtype is torch.float32 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert not qkv_proj.scheme.is_static_input_scheme + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.weight.dtype is torch.int8 - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert not qkv_proj.scheme.is_static_input_scheme - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.weight.dtype is torch.int8 + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) + def check_model(model): + layer = model.model.layers[0] - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.group_size == (-1 if group is None else group) + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) - assert qkv_proj.weight_packed.dtype is torch.int32 - assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.scheme.pack_factor == pack_factor + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == (-1 + if group is None else group) + + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.scheme.pack_factor == pack_factor + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -173,14 +187,18 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = 
model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) - assert qkv_proj.weight_packed.dtype is torch.int32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_fp8(vllm_runner): model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance( - qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.input_scale.dtype is torch.float32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) - if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - assert qkv_proj.weight_scale.dtype is torch.float32 - assert len(qkv_proj.weight_scale.shape) == 0 + assert qkv_proj.input_scale.dtype is torch.float32 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -273,12 +298,15 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.int8 - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def 
check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -293,20 +321,24 @@ def test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensors24) - - assert qkv_proj.scheme.weight_quant is None - assert qkv_proj.scheme.input_quant is None - assert not qkv_proj.scheme.quantized - assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" - assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a0c1d7e24c503..4bff734746297 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,13 +49,17 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - attn = model.model.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but we know - # these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < attn._v_scale < 1.0 + def check_model(model): + attn = model.model.layers[0].self_attn.attn + + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + + llm.apply_model(check_model) # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - fc1 = model.model.decoder.layers[0].fc1 - assert 
isinstance(fc1.quant_method, Fp8LinearMethod) - if kv_cache_dtype == "fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 - - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + def check_model(model): + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 + + if current_platform.has_device_capability(89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + + llm.apply_model(check_model) @pytest.mark.skipif(not is_quant_method_supported("fp8"), diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ad526a4065101..fa2d9645ea47f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -28,20 +28,23 @@ def test_lm_head( model_lm_head_quant: Tuple[str, bool], ) -> None: model, lm_head_quantized = model_lm_head_quant - vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) - - lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model.lm_head) - - if lm_head_quantized: - assert isinstance( - lm_head_layer.linear_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) - else: - assert isinstance(lm_head_layer.linear_method, - UnquantizedEmbeddingMethod) - - print( - vllm_model.generate_greedy(prompts=["Hello my name is"], - max_tokens=10)[0][1]) - del vllm_model + + with vllm_runner(model, dtype=torch.float16, + max_model_len=2048) as vllm_model: + + def check_model(model): + lm_head_layer = model.lm_head + + if lm_head_quantized: + assert isinstance(lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, + MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, + UnquantizedEmbeddingMethod) + + vllm_model.apply_model(check_model) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 27493a682b746..11382ad708faa 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -12,19 +12,22 @@ def test_quark_fp8(vllm_runner): model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + qkv_proj = layer.self_attn.qkv_proj - if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz - assert len(qkv_proj.weight_scale.shape) == 0 + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index bf409d2d97aa1..6e7eec1c6ab34 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -3,6 +3,7 @@ import os import pathlib import subprocess +from functools import partial from unittest.mock import MagicMock, patch import openai @@ -24,7 +25,6 @@ # yapf: enable from vllm.utils import PlaceholderModule, import_from_path -from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip @@ -58,16 +58,6 @@ def is_curl_installed(): return False -def get_torch_model(vllm_runner: VllmRunner): - return vllm_runner \ - .model \ - .llm_engine \ - .model_executor \ - .driver_worker \ - .model_runner \ - .model - - def write_keyfile(keyfile_path: str): encryption_params = EncryptionParams.random() pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) @@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - serialize_vllm_model(get_torch_model(vllm_model), - 
config_for_serializing) + + vllm_model.apply_model( + partial(serialize_vllm_model, + tensorizer_config=config_for_serializing)) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) with vllm_runner( model_ref, @@ -215,8 +209,10 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) model_loader_extra_config = { "tensorizer_uri": str(model_path), @@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(get_torch_model(vllm_model), config) + + vllm_model.apply_model( + partial(serialize_vllm_model, tensorizer_config=config)) assert is_vllm_tensorized(config) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index b6bba1d67b408..6a6b4a14a4c49 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -5,10 +5,10 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, - Iterable, List, Mapping, NamedTuple, Optional) +from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, + List, Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence -from typing import Set, Tuple, Type, Union, cast, overload +from typing import Set, Type, Union, cast, overload import torch from typing_extensions import TypeVar, deprecated @@ -1818,17 +1818,6 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.model_executor.stop_profile() - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: - """ - See LLM.collective_rpc for more details. - """ - return self.model_executor.collective_rpc(method, timeout, args, - kwargs) - def check_health(self) -> None: if self.tokenizer: self.tokenizer.check_health() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0cfe6be9ac767..27386daa4bbc9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,8 +5,9 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch.nn as nn from tqdm import tqdm -from typing_extensions import deprecated +from typing_extensions import TypeVar, deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -42,6 +43,8 @@ logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. 
@@ -464,25 +467,42 @@ def generate( return self.engine_class.validate_outputs(outputs, RequestOutput) def collective_rpc(self, - method: Union[str, Callable], + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. + """ + executor = self.llm_engine.model_executor + return executor.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: """ - Run a method on all workers, with homogeneous arguments. - The main extension point for the LLM entrypoint. - Users can provide custom worker class through `worker_cls` - argument, and implement new methods in the worker class. - Then, users can call the new methods through this API. - It is recommended to use this API to only pass control messages, - and set up data-plane communication to pass data. - The method can also be a callable, which will be serialized - and sent to all workers to execute. - If the method is a callable, it should accept an additional - `self` argument, in addition to the arguments passed in `args` - and `kwargs`. The `self` argument will be the worker object. + Run a function directly on the model inside each worker, + returning the result for each of them. """ - return self.llm_engine.collective_rpc(method, timeout, args, kwargs) + executor = self.llm_engine.model_executor + return executor.apply_model(func) def beam_search( self, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index e5952b388c543..859e105f15d97 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -3,6 +3,9 @@ from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, Union) +import torch.nn as nn +from typing_extensions import TypeVar + from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -11,9 +14,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class ExecutorBase(ABC): """Base class for all executors. 
@@ -44,22 +50,37 @@ def __init__( @abstractmethod def _init_executor(self) -> None: - pass + raise NotImplementedError @abstractmethod def collective_rpc(self, - method: Union[str, Callable], + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: """ - The main interface of the executor to run a method on all workers, - with homogeneous arguments. - If the args are heterogeneous, then we can pack them into a list, - and unpack them in the method of every worker, because every worker - knows their own rank. + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. """ - pass + raise NotImplementedError def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and @@ -97,6 +118,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + """ + + def rpc_func(worker: WorkerBase) -> _R: + return func(worker.get_model()) + + return self.collective_rpc(rpc_func) + def execute_model( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index a80b0ee8b3122..78c86321d861d 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -148,7 +148,7 @@ def _run_workers( async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> Any: + ) -> List[Any]: """Runs the given method on all workers. Args: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index fbd4937112e11..5b4757072353f 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -459,16 +459,7 @@ def tensorize_vllm_model(engine_args: EngineArgs, stream.write(encryption_params.key) engine = LLMEngine.from_engine_args(engine_args) - if tensorizer_config._is_sharded: - # if the engine is a distributed engine (for tensor parallel) then each - # worker shard needs to serialize its part of the model. 
- engine.model_executor._run_workers( - "save_tensorized_model", - tensorizer_config=tensorizer_config, - ) - else: - # with a single worker, we can get to the underlying model directly - serialize_vllm_model( - engine.model_executor.driver_worker.model_runner.model, - tensorizer_config, - ) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index bb6b99135580e..e906b1789cde8 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -2,6 +2,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest @@ -10,6 +11,10 @@ from vllm.spec_decode.top1_proposer import Top1Proposer +class _DummyModel(nn.Module): + pass + + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. @@ -36,7 +41,6 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, def init_device(self): self.device = torch.device(f"{self.device_type}:{self.local_rank}") - self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( @@ -45,6 +49,12 @@ def init_device(self): vocab_size=self.vocab_size, ) + def load_model(self) -> None: + pass # Dummy + + def get_model(self) -> nn.Module: + return _DummyModel() + def sampler_output( self, execute_model_req: ExecuteModelRequest, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8896b7dbc6b8a..c6ff5e52f9388 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, @@ -15,6 +16,10 @@ logger = init_logger(__name__) +class _DummyModel(nn.Module): + pass + + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. 
@@ -139,6 +144,13 @@ def get_spec_proposals( return self._worker.get_spec_proposals( execute_model_req, seq_ids_with_bonus_token_in_last_step) + def get_model(self) -> nn.Module: + if self._is_dummy: + return _DummyModel() + + with self._patch_tensor_parallel_group(): + return self._worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 540d118d65ecb..0d66ede3d907a 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch +import torch.nn as nn from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict @@ -403,6 +404,9 @@ def initialize_cache(self, num_gpu_blocks: int, self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.scorer_worker.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 93026029ad13e..f6cf35da0106b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -94,22 +94,12 @@ def collective_rpc(self, timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: - """ - Execute an RPC call on workers. - - Args: - method: Name of the worker method to execute - timeout: Maximum time in seconds to wait for execution. Rases a - TimeoutError on timeout. None means wait indefinitely. - args: Positional arguments to pass to the worker method - kwargs: Keyword arguments to pass to the worker method - - Returns: - List of results from each worker - """ start_time = time.monotonic() kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, + # and unpack them in the method of every worker, because every worker + # knows their own rank. try: if isinstance(method, str): send_method = method diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 87a1cd7f9e627..2350074c23a59 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -689,6 +689,9 @@ def _gather_encoder_outputs( encoder_outputs.append(encoder_output[start_idx:end_idx]) return encoder_outputs + def get_model(self) -> nn.Module: + return self.model + @torch.inference_mode() def execute_model( self, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4fb4197f1822f..0929e64d58f1e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -5,6 +5,7 @@ import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig @@ -176,6 +177,9 @@ def compile_or_warm_up_model(self) -> None: # the model initialization and profiling. 
set_random_seed(self.model_config.seed) + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 303d9a15e9c3c..abbf6450ab7f6 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -509,6 +509,9 @@ def load_model(self) -> None: ) self.model = self.lora_manager.create_lora_manager(self.model) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 260ffaf27f9a1..4c8f69e449393 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -21,6 +21,7 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import torch.nn as nn from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, HabanaMemoryProfiler, format_bytes) @@ -676,6 +677,9 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) + def get_model(self) -> nn.Module: + return self.model + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ae8b7f97c827d..cb2ff0c934da3 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1176,6 +1176,9 @@ def load_model(self) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + def get_model(self) -> nn.Module: + return self.model + def save_sharded_state( self, path: str, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index c7abad7e0258d..acfd6d0b03f62 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -7,6 +7,7 @@ Optional, Type, TypeVar) import torch +import torch.nn as nn from torch import is_tensor from vllm.config import VllmConfig @@ -264,6 +265,10 @@ def prepare_model_input( """ raise NotImplementedError + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + def execute_model( self, model_input: T, @@ -297,9 +302,9 @@ class ModelRunnerWrapperBase: def __init__( self, - moderl_runner: ModelRunnerBase, + model_runner: ModelRunnerBase, ) -> None: - self.model_runner: ModelRunnerBase = moderl_runner + self.model_runner: ModelRunnerBase = model_runner def __getattr__(self, attr): return getattr(self.model_runner, attr) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a35f5467e1a1f..596c26eac28bd 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -113,6 +113,9 @@ def load_model(self) -> None: raise NotImplementedError( "Supports only Transformer-NeuronX based models.") + def get_model(self) -> nn.Module: + return self.model + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a38b5a4e6e8d5..9d0a759ca2f21 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -84,6 +84,9 @@ def load_model(self) -> None: kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) + def get_model(self) -> nn.Module: + return self.model + def 
_prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 50a155d22c666..f5b46cde3969c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -4,6 +4,7 @@ import openvino as ov import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.attention import get_attn_backend @@ -362,6 +363,9 @@ def cache_copy( ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 52c577bccab9c..f5c7bc955a673 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -158,6 +158,9 @@ def load_model(self) -> None: fullgraph=True, dynamic=False) + def get_model(self) -> nn.Module: + return self.model.model + def _dummy_run( self, batch_size: int, diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index fb9919f7a7b6a..1104eceef72a3 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,6 +6,7 @@ import cloudpickle import torch +import torch.nn as nn from vllm.config import ObservabilityConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group @@ -90,6 +91,11 @@ def start_worker_execution_loop(self) -> None: if output is None: return None + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -147,6 +153,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -363,6 +372,9 @@ def prepare_input( else: return self._get_worker_input_from_broadcast() + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 82b8f22a5af33..25a2fea1e8eac 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -416,6 +416,9 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def get_model(self) -> nn.Module: + return self.model + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size()
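
---

Usage note (not part of the patch): the sketch below mirrors the migration pattern this change applies across the test suite, replacing the old `llm_engine.model_executor.driver_worker.model_runner.model` attribute chain with the new `VllmRunner.apply_model`. It is a minimal illustration only; the model name and the specific assertions are placeholders, not taken from this patch.

```python
import torch.nn as nn


def test_model_introspection(vllm_runner):
    # Illustrative model name; any model supported by vllm_runner works.
    with vllm_runner("facebook/opt-125m", dtype="float16") as vllm_model:
        # Old pattern (removed by this patch):
        #   model = vllm_model.model.llm_engine.model_executor \
        #       .driver_worker.model_runner.model
        #
        # New pattern: the callable runs next to the model inside each
        # worker, and apply_model returns one result per worker.
        def check_model(model: nn.Module):
            assert isinstance(model, nn.Module)
            return type(model).__name__

        names = vllm_model.apply_model(check_model)
        assert len(names) >= 1
```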
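The executor-level `apply_model` added in `vllm/executor/executor_base.py` is a thin wrapper around `collective_rpc` with a callable that receives the worker as `self` and calls the new `WorkerBase.get_model()` hook. Below is a minimal sketch of the equivalent calls at the `LLM` entrypoint, under the assumption of an illustrative model name; it is not an excerpt from the patch.

```python
from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # illustrative model name


# A callable passed to collective_rpc is serialized, sent to every worker,
# and invoked with the worker object as `self`.
def count_params(self):
    model = self.get_model()  # WorkerBase.get_model(), added in this patch
    return sum(p.numel() for p in model.parameters())


per_worker = llm.collective_rpc(count_params)

# Equivalent via the convenience wrapper introduced here, which forwards
# func(worker.get_model()) through collective_rpc under the hood:
per_worker_2 = llm.apply_model(
    lambda model: sum(p.numel() for p in model.parameters()))

assert per_worker == per_worker_2
```

Both calls return one entry per worker, so with tensor parallelism the returned list has one element per TP rank.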