diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index a11cd318d8949..058d765b49aab 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -2781,6 +2781,11 @@ def _patch_prev_output(self):
         model_input = self.cached_step_inputs.pop(0)
         delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist()
         ctx = model_input.async_callback.keywords["ctx"]
+        # If there is no output to patch with, which typically happens when
+        # a new request starts after all in-flight requests have completed,
+        # return early (cached_step_inputs/outputs were already popped above).
+        if len(ctx.output_queue) == 0:
+            return
         assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
         output_data = ctx.output_queue[0]
         assert len(output_data.outputs) == 1
@@ -2792,4 +2797,4 @@ def _patch_prev_output(self):
         # This is a hack. Assigning output_token_ids triggers
         # a cache recomputation and we only need to update the last token
         seq_data.output_token_ids_array[-1] = real_out
-        seq_data._cached_all_token_ids[-1] = real_out
+        seq_data._cached_all_token_ids[-1] = real_out
\ No newline at end of file
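
For context, here is a minimal self-contained sketch of the invariant the new comment calls out: both step caches must be popped before the early return, otherwise they would drift out of sync on the next step. The deques and function below are hypothetical stand-ins, not vLLM's actual `cached_step_inputs`/`cached_step_outputs`.

```python
from collections import deque

# Hypothetical stand-ins for the model runner's per-step caches.
cached_step_inputs: deque = deque()
cached_step_outputs: deque = deque()

def patch_prev_output(output_queue):
    # Always consume one entry from BOTH caches, mirroring the pops at the
    # top of _patch_prev_output in the diff above.
    model_input = cached_step_inputs.popleft()
    delayed_output = cached_step_outputs.popleft()
    # Early return when there is nothing to patch (empty output queue),
    # e.g. a fresh request arriving after all in-flight requests finished.
    if len(output_queue) == 0:
        return None
    # ... otherwise the last token of the queued output would be patched
    # with delayed_output here ...
    return model_input, delayed_output

cached_step_inputs.append("input-step-0")
cached_step_outputs.append("token-step-0")
# Empty queue: nothing to patch, but the caches are still drained, so the
# next step starts from a consistent state.
assert patch_prev_output([]) is None
assert not cached_step_inputs and not cached_step_outputs
```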