diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b2f6c33858cbc..e90b76dcdd9ad 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -347,6 +347,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: self.input_batch.block_table.append_row(req_index, start_index, req_data.new_block_ids) + batch_changed = len(removed_req_indices) > 0 or len(req_ids_to_add) > 0 + # Add the new or resumed requests to the persistent batch. # The smaller empty indices are filled first. removed_req_indices = sorted(removed_req_indices, reverse=True) @@ -363,8 +365,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: # Condense the batched states if there are empty indices. if removed_req_indices: self.input_batch.condense(removed_req_indices) - return (len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0 - or len(scheduler_output.finished_req_ids) > 0) + + return batch_changed def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens