-
-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Bugfix] Fix beam search logits processor #3454
Changes from 7 commits
9535c1e
7e1edef
afe7aa5
a1bca5e
00db489
5ea2899
1128f86
c0e4028
d5e62e7
fea77ba
4f31aa4
39b26b9
4c2a75a
d62e522
4e6444c
76f7520
c63b61c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,6 +13,7 @@ | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import copy | ||
import json | ||
import math | ||
from collections import defaultdict | ||
|
@@ -25,7 +26,7 @@ | |
from outlines.fsm.json_schema import build_regex_from_schema | ||
|
||
|
||
class BaseLogitsProcessor: | ||
class BaseGuidedLogitsProcessor: | ||
maximzubkov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def adapt_tokenizer(self, tokenizer: PreTrainedTokenizerBase): | ||
"""Adapt vLLM's tokenizer to use to compile the FSM. | ||
|
@@ -81,9 +82,29 @@ def __call__(self, input_ids: List[int], | |
|
||
seq_id = hash(tuple(input_ids)) | ||
|
||
if len(input_ids) == 0: | ||
if not hasattr(self, "fsm_state"): | ||
self.init_state() | ||
else: | ||
if not hasattr(self, "fsm_state"): | ||
if len(input_ids) == 1: | ||
maximzubkov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# This special scenario arises during sampling strategies | ||
# such as beam search when the number of sequences to be | ||
# generated (`n`) is bigger than 1. | ||
# During the initial step of beam search, only the input | ||
# `prompt` is given, while the beams themselves are yet | ||
# to be defined. | ||
# Consequently, the logits will have a shape of | ||
# [1, vocab_size]. | ||
# Due to this, `self.fsm_state` initialization will be | ||
# triggered only for the very first `logits_processor`, | ||
# leaving the remaining `n-1` uninitialized. | ||
self.init_state() | ||
empty_seq_id = hash(tuple([])) | ||
self.fsm.allowed_token_ids(self.fsm_state[empty_seq_id]) | ||
else: | ||
raise ValueError( | ||
f"Multiple ids were generated: {input_ids}, " | ||
"while fsm is still not initialized") | ||
last_token = input_ids[-1] | ||
last_seq_id = hash(tuple(input_ids[:-1])) | ||
self.fsm_state[seq_id] = self.fsm.next_state( | ||
|
@@ -99,8 +120,16 @@ def __call__(self, input_ids: List[int], | |
|
||
return scores | ||
|
||
def __deepcopy__(self, memo): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note that by implementing the Code to reproduce:
Error message:
|
||
logits_processor = self.__class__( | ||
copy.deepcopy(self.fsm.cfg_string, memo), | ||
copy.deepcopy(self.fsm.tokenizer, memo), | ||
) | ||
logits_processor.fsm = self.fsm.copy() | ||
return logits_processor | ||
|
||
class RegexLogitsProcessor(BaseLogitsProcessor): | ||
|
||
class RegexLogitsProcessor(BaseGuidedLogitsProcessor): | ||
|
||
def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase): | ||
"""Compile the FSM that drives the regex-structured generation. | ||
|
@@ -120,10 +149,12 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase): | |
|
||
class JSONLogitsProcessor(RegexLogitsProcessor): | ||
|
||
def __init__(self, | ||
schema: Union[str, Dict, BaseModel], | ||
tokenizer: PreTrainedTokenizerBase, | ||
whitespace_pattern: Optional[str] = None): | ||
def __init__( | ||
self, | ||
schema: Union[str, Dict, BaseModel], | ||
tokenizer: PreTrainedTokenizerBase, | ||
whitespace_pattern: Optional[str] = None, | ||
): | ||
"""Compile the FSM that drives the JSON-guided generation. | ||
|
||
Parameters | ||
|
@@ -154,7 +185,7 @@ def __init__(self, | |
super().__init__(regex_string, tokenizer) | ||
|
||
|
||
class CFGLogitsProcessor(BaseLogitsProcessor): | ||
class CFGLogitsProcessor(BaseGuidedLogitsProcessor): | ||
maximzubkov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase): | ||
maximzubkov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"""Compile the FSM that drives the context free grammar generation. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -148,7 +148,14 @@ def __init__( | |
self.prompt_logprobs = prompt_logprobs | ||
self.skip_special_tokens = skip_special_tokens | ||
self.spaces_between_special_tokens = spaces_between_special_tokens | ||
self.logits_processors = logits_processors | ||
# A separate logit processor is needed for each output sequence | ||
# since certain logits processors (such as BaseGuidedLogitsProcessor) | ||
# in multi-beam generation must track the sequences generated | ||
# by each beam up to that point. | ||
# See https://github.com/vllm-project/vllm/issues/3448 for more | ||
self.logits_processors = [ | ||
copy.deepcopy(logits_processors) for _ in range(n) | ||
maximzubkov marked this conversation as resolved.
Show resolved
Hide resolved
|
||
] | ||
self.include_stop_str_in_output = include_stop_str_in_output | ||
self._verify_args() | ||
if self.use_beam_search: | ||
|
@@ -246,10 +253,10 @@ def clone(self) -> "SamplingParams": | |
See https://github.com/vllm-project/vllm/issues/3087 | ||
""" | ||
|
||
logit_processor_refs = None if self.logits_processors is None else { | ||
logit_processor_refs = (None if self.logits_processors is None else { | ||
id(lp): lp | ||
for lp in self.logits_processors | ||
} | ||
for lp in self.logits_processors if not hasattr(lp[0], "fsm") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This affects the following line and as described in the following issue and partially addressed by the following PR. Unfortunately, in this case, the copy of logits_processors is inevitable and seems like it indeed does slow down the inference. However, I'm using a relatively old GPU (4x NVIDIA RTX A4000, 16Gb) so I would need someone with better hardware to benchmark the speed. Refer to the issue for the code snippets to reproduce the tests |
||
}) | ||
return copy.deepcopy(self, memo=logit_processor_refs) | ||
|
||
def __repr__(self) -> str: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
since n = 3 here let's also verify all the outputs, you might also need to add
use_beam_search
toextra_body