[Bugfix] Fix offline_inference_with_prefix.py (vllm-project#9505)
Signed-off-by: qishuai <ferdinandzhong@gmail.com>
tlrmchlsmth authored and FerdinandZhong committed Oct 29, 2024
1 parent f86b0f4 commit 7a9113e
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions examples/offline_inference_with_prefix.py
@@ -29,11 +29,13 @@
 sampling_params = SamplingParams(temperature=0.0)

 # Create an LLM.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
+regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3)

+# The second LLM needs to request a higher gpu_memory_utilization because
+# the first LLM has already allocated a full 30% of the gpu memory.
 prefix_cached_llm = LLM(model="facebook/opt-125m",
                         enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
+                        gpu_memory_utilization=0.6)
 print("Results without `enable_prefix_caching`")

 # Generate texts from the prompts. The output is a list of RequestOutput objects
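
For context, a minimal, self-contained sketch of how the two LLM instances in this example are exercised after the change. The prompts and the output loop below are illustrative placeholders, not the exact contents of examples/offline_inference_with_prefix.py; the LLM constructor arguments match the diff above.

    from vllm import LLM, SamplingParams

    # Illustrative prompts sharing a common prefix; prefix caching lets the
    # second LLM reuse the KV cache computed for that shared prefix.
    prefix = "You are a helpful assistant. Answer concisely.\nQuestion: "
    prompts = [prefix + q for q in ["What is the capital of France?",
                                    "Who wrote Hamlet?"]]

    sampling_params = SamplingParams(temperature=0.0)

    # First LLM: no prefix caching, capped at 30% of GPU memory.
    regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3)

    # Second LLM: prefix caching enabled; it requests a higher
    # gpu_memory_utilization because the first LLM already holds 30%.
    prefix_cached_llm = LLM(model="facebook/opt-125m",
                            enable_prefix_caching=True,
                            gpu_memory_utilization=0.6)

    for label, llm in [("without", regular_llm), ("with", prefix_cached_llm)]:
        print(f"Results {label} `enable_prefix_caching`")
        for output in llm.generate(prompts, sampling_params):
            print(f"  {output.prompt!r} -> {output.outputs[0].text!r}")

The point of the fix: with both instances created in the same process, their gpu_memory_utilization fractions must together stay within the available GPU memory, so the first is lowered to 0.3 and the second raised to 0.6.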
