Fixes delayed sampling for sequential requests #845

Merged
7 changes: 6 additions & 1 deletion vllm/worker/hpu_model_runner.py
@@ -2781,6 +2781,11 @@ def _patch_prev_output(self):
model_input = self.cached_step_inputs.pop(0)
delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(-1).tolist()
ctx = model_input.async_callback.keywords["ctx"]
# If there is no output to patch with, which typically happens
# when a new request starts after all in-flight requests have
# completed, return early (note that cached_step_inputs/outputs
# have already been cleared above, as required).
if len(ctx.output_queue) == 0:
return
assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
output_data = ctx.output_queue[0]
assert len(output_data.outputs) == 1
@@ -2792,4 +2797,4 @@ def _patch_prev_output(self):
# This is a hack. Assigning output_token_ids triggers
# a cache recomputation and we only need to update the last token
seq_data.output_token_ids_array[-1] = real_out
seq_data._cached_all_token_ids[-1] = real_out
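
For readers following along, here is a minimal, self-contained sketch of the delayed-sampling flow this guard protects: the token sampled at step N is only patched into the scheduler context at step N+1, so when a new request starts after all in-flight work has drained, the output queue is empty but the step caches still need clearing. The `FakeCtx` and `FakeSeqData` classes and the standalone `patch_prev_output` function below are simplified stand-ins invented for illustration, not the actual vLLM types:

```python
from collections import deque

class FakeSeqData:
    """Simplified stand-in for vLLM's sequence data (illustration only)."""
    def __init__(self):
        self.output_token_ids_array = [0]   # placeholder token from step N
        self._cached_all_token_ids = [0]

class FakeCtx:
    """Simplified stand-in for the async-callback context (illustration only)."""
    def __init__(self):
        self.output_queue = deque()

def patch_prev_output(cached_step_inputs, cached_step_outputs, ctx, seq_data):
    if not cached_step_inputs:
        return  # nothing was deferred last step
    cached_step_inputs.pop(0)
    real_out = cached_step_outputs.pop(0)
    # The fix: a fresh request arriving after all in-flight requests
    # completed finds an empty output queue. The caches above were
    # still popped, so we simply return.
    if len(ctx.output_queue) == 0:
        return
    assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
    # Patch the placeholder token with the real sampled one.
    seq_data.output_token_ids_array[-1] = real_out
    seq_data._cached_all_token_ids[-1] = real_out

# Scenario the guard handles: sequential (non-overlapping) requests.
ctx, seq = FakeCtx(), FakeSeqData()
cached_inputs, cached_outputs = ["model_input"], [42]  # step N deferred a token
# Request 1 finished and its queue was drained before request 2 arrived,
# so output_queue is empty here; without the guard, the assert would fire.
patch_prev_output(cached_inputs, cached_outputs, ctx, seq)
assert not cached_inputs and not cached_outputs  # caches were still cleared
```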