Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bugfix] Fix beam search logits processor #3454

11 changes: 7 additions & 4 deletions tests/entrypoints/test_openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,20 +726,23 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI):
prompt=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"),
temperature=1.0,
n=3,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since n = 3 here let's also verify all the outputs, you might also need to add use_beam_search to extra_body

max_tokens=500,
extra_body=dict(guided_grammar=simple_sql_grammar))

content = completion.choices[0].text

# use Lark to parse the output, and make sure it's a valid parse tree
from lark import Lark
parser = Lark(simple_sql_grammar)
parser.parse(content)

# remove spaces for comparison b/c we removed them in the grammar
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

assert content.strip() == ground_truth
for _ in range(3):
content = completion.choices[0].text
parser = Lark(simple_sql_grammar)
parser.parse(content)

assert content.strip() == ground_truth


if __name__ == "__main__":
Expand Down
45 changes: 37 additions & 8 deletions vllm/model_executor/guided_logits_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from transformers import PreTrainedTokenizerBase


class BaseLogitsProcessor:
class BaseGuidedLogitsProcessor:

def adapt_tokenizer(self, tokenizer: PreTrainedTokenizerBase):
"""Adapt vLLM's tokenizer to use to compile the FSM.
Expand Down Expand Up @@ -83,11 +83,30 @@ def __call__(self, input_ids: List[int],

if len(input_ids) == 0:
self.init_state()
else:
elif not hasattr(self, "fsm_state") and len(input_ids) == 1:
# This special scenario arises during sampling strategies
# such as beam search when the number of sequences to be
# generated (`n`) is bigger than 1.
# During the initial step of beam search, only the input
# `prompt` is given, while the beams themselves are yet
# to be defined.
# Consequently, the logits will have a shape of
# [1, vocab_size].
# Due to this, `self.fsm_state` initialization will be
# triggered only for the very first `logits_processor`,
# leaving the remaining `n-1` uninitialized.
self.init_state()
empty_seq_id = hash(tuple([]))
self.fsm.allowed_token_ids(self.fsm_state[empty_seq_id])

last_token = input_ids[-1]
last_seq_id = hash(tuple(input_ids[:-1]))
self.fsm_state[seq_id] = self.fsm.next_state(
self.fsm_state[last_seq_id], last_token)
else:
raise ValueError(
f"Multiple ids were generated: {input_ids}, "
"while fsm is still not initialized")

allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])

Expand All @@ -100,7 +119,7 @@ def __call__(self, input_ids: List[int],
return scores


class RegexLogitsProcessor(BaseLogitsProcessor):
class RegexLogitsProcessor(BaseGuidedLogitsProcessor):

def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
"""Compile the FSM that drives the regex-structured generation.
Expand All @@ -120,10 +139,12 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):

class JSONLogitsProcessor(RegexLogitsProcessor):

def __init__(self,
schema: Union[str, Dict, BaseModel],
tokenizer: PreTrainedTokenizerBase,
whitespace_pattern: Optional[str] = None):
def __init__(
self,
schema: Union[str, Dict, BaseModel],
tokenizer: PreTrainedTokenizerBase,
whitespace_pattern: Optional[str] = None,
):
"""Compile the FSM that drives the JSON-guided generation.

Parameters
Expand Down Expand Up @@ -154,7 +175,7 @@ def __init__(self,
super().__init__(regex_string, tokenizer)


class CFGLogitsProcessor(BaseLogitsProcessor):
class CFGLogitsProcessor(BaseGuidedLogitsProcessor):

def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
"""Compile the FSM that drives the context free grammar generation.
Expand All @@ -170,3 +191,11 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
tokenizer = self.adapt_tokenizer(tokenizer)
fsm = CFGFSM(cfg, tokenizer)
self.fsm = fsm

def __deepcopy__(self, memo):
    """Return an independent copy of this processor for a new sequence.

    Each beam/sequence needs its own processor so FSM state is tracked
    per beam. The compiled grammar machinery is duplicated via
    ``self.fsm.copy()`` rather than re-running ``__init__``, which would
    needlessly recompile the CFG from scratch.

    Note: the original code passed ``memo`` as two extra positional
    arguments to ``self.__class__(...)``, whose ``__init__`` only accepts
    ``(cfg, tokenizer)`` — that raised ``TypeError`` on every deepcopy.
    """
    cls = self.__class__
    copied = cls.__new__(cls)
    # Register the copy in memo before populating it, per the deepcopy
    # protocol, so any cyclic references resolve to this instance.
    memo[id(self)] = copied
    # `fsm` is the only attribute set by __init__; anything set later
    # (e.g. fsm_state) is deliberately NOT carried over — the copy
    # starts with fresh decoding state, matching the original intent.
    copied.fsm = self.fsm.copy()
    return copied
12 changes: 6 additions & 6 deletions vllm/model_executor/layers/logits_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,18 @@ def _apply_logits_processors(
logits_row_idx = 0
found_logits_processors = False
for seq_ids, sampling_params in sampling_metadata.seq_groups:
logits_processors = sampling_params.logits_processors
if logits_processors:
found_logits_processors = True
for seq_id in seq_ids:
for i, seq_id in enumerate(seq_ids):
logits_processors = sampling_params.logits_processors[i]
if logits_processors:
found_logits_processors = True
logits_row = logits[logits_row_idx]
token_ids = sampling_metadata.seq_data[seq_id].output_token_ids
for logits_processor in logits_processors:
logits_row = logits_processor(token_ids, logits_row)
logits[logits_row_idx] = logits_row
logits_row_idx += 1
else:
logits_row_idx += len(seq_ids)
else:
logits_row_idx += len(seq_ids)
if found_logits_processors:
assert logits_row_idx == logits.shape[0]
return logits
25 changes: 19 additions & 6 deletions vllm/sampling_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import copy
from enum import IntEnum
from functools import cached_property
from typing import Callable, List, Optional, Union
from typing import Callable, List, Optional, Union, Sequence

import torch
from pydantic import conint
Expand Down Expand Up @@ -163,7 +163,14 @@ def __init__(
self.detokenize = detokenize
self.skip_special_tokens = skip_special_tokens
self.spaces_between_special_tokens = spaces_between_special_tokens
self.logits_processors = logits_processors
# A separate logit processor is needed for each output sequence
# since certain logits processors (such as BaseGuidedLogitsProcessor)
# in multi-beam generation must track the sequences generated
# by each beam up to that point.
# See https://github.com/vllm-project/vllm/issues/3448 for more
self.logits_processors = [
copy.deepcopy(logits_processors) for _ in range(n)
]
self.include_stop_str_in_output = include_stop_str_in_output
self.truncate_prompt_tokens = truncate_prompt_tokens
self._verify_args()
Expand Down Expand Up @@ -278,11 +285,17 @@ def clone(self) -> "SamplingParams":
arbitrary, nontrivial amount of data.
See https://github.com/vllm-project/vllm/issues/3087
"""
logit_processor_refs = None

if self.logits_processors:
logit_processor_refs = {}
for lp in self.logits_processors:
if lp:
lp_first = lp[0] if isinstance(lp, Sequence) else lp
if hasattr(lp_first, "fsm"):
continue
logit_processor_refs[id(lp)] = lp

logit_processor_refs = None if self.logits_processors is None else {
id(lp): lp
for lp in self.logits_processors
}
return copy.deepcopy(self, memo=logit_processor_refs)

def __repr__(self) -> str:
Expand Down
Loading