Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bugfix] Fix beam search logits processor #3454

11 changes: 7 additions & 4 deletions tests/entrypoints/test_openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,20 +726,23 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI):
prompt=("Generate a sql state that select col_1 from "
"table_1 where it is equals to 1"),
temperature=1.0,
n=3,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since n = 3 here let's also verify all the outputs, you might also need to add use_beam_search to extra_body

max_tokens=500,
extra_body=dict(guided_grammar=simple_sql_grammar))

content = completion.choices[0].text

# use Lark to parse the output, and make sure it's a valid parse tree
from lark import Lark
parser = Lark(simple_sql_grammar)
parser.parse(content)

# remove spaces for comparison b/c we removed them in the grammar
ground_truth = "SELECT col_1 from table_1 where col_1 = 1".replace(" ", "")

assert content.strip() == ground_truth
for _ in range(3):
content = completion.choices[0].text
parser = Lark(simple_sql_grammar)
parser.parse(content)

assert content.strip() == ground_truth


if __name__ == "__main__":
Expand Down
45 changes: 37 additions & 8 deletions vllm/model_executor/guided_logits_processors.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from transformers import PreTrainedTokenizerBase


class BaseLogitsProcessor:
class BaseGuidedLogitsProcessor:

def adapt_tokenizer(self, tokenizer: PreTrainedTokenizerBase):
"""Adapt vLLM's tokenizer to use to compile the FSM.
Expand Down Expand Up @@ -83,11 +83,30 @@ def __call__(self, input_ids: List[int],

if len(input_ids) == 0:
self.init_state()
else:
elif not hasattr(self, "fsm_state") and len(input_ids) == 1:
# This special scenario arises during sampling strategies
# such as beam search when the number of sequences to be
# generated (`n`) is bigger than 1.
# During the initial step of beam search, only the input
# `prompt` is given, while the beams themselves are yet
# to be defined.
# Consequently, the logits will have a shape of
# [1, vocab_size].
# Due to this, `self.fsm_state` initialization will be
# triggered only for the very first `logits_processor`,
# leaving the remaining `n-1` uninitialized.
self.init_state()
empty_seq_id = hash(tuple([]))
self.fsm.allowed_token_ids(self.fsm_state[empty_seq_id])

last_token = input_ids[-1]
last_seq_id = hash(tuple(input_ids[:-1]))
self.fsm_state[seq_id] = self.fsm.next_state(
self.fsm_state[last_seq_id], last_token)
else:
raise ValueError(
f"Multiple ids were generated: {input_ids}, "
"while fsm is still not initialized")

allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])

Expand All @@ -100,7 +119,7 @@ def __call__(self, input_ids: List[int],
return scores


class RegexLogitsProcessor(BaseLogitsProcessor):
class RegexLogitsProcessor(BaseGuidedLogitsProcessor):

def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
"""Compile the FSM that drives the regex-structured generation.
Expand All @@ -120,10 +139,12 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):

class JSONLogitsProcessor(RegexLogitsProcessor):

def __init__(self,
schema: Union[str, Dict, BaseModel],
tokenizer: PreTrainedTokenizerBase,
whitespace_pattern: Optional[str] = None):
def __init__(
self,
schema: Union[str, Dict, BaseModel],
tokenizer: PreTrainedTokenizerBase,
whitespace_pattern: Optional[str] = None,
):
"""Compile the FSM that drives the JSON-guided generation.

Parameters
Expand Down Expand Up @@ -154,7 +175,7 @@ def __init__(self,
super().__init__(regex_string, tokenizer)


class CFGLogitsProcessor(BaseLogitsProcessor):
class CFGLogitsProcessor(BaseGuidedLogitsProcessor):

def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
"""Compile the FSM that drives the context free grammar generation.
Expand All @@ -170,3 +191,11 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
tokenizer = self.adapt_tokenizer(tokenizer)
fsm = CFGFSM(cfg, tokenizer)
self.fsm = fsm

def __deepcopy__(self, memo):
    """Return an independent copy of this processor for a new sequence.

    Each beam/sequence needs its own processor so FSM state is tracked
    per beam. The compiled grammar machinery is duplicated via
    ``self.fsm.copy()`` rather than re-running ``__init__``, which would
    needlessly recompile the CFG from scratch.

    Note: the original code passed ``memo`` as two extra positional
    arguments to ``self.__class__(...)``, whose ``__init__`` only accepts
    ``(cfg, tokenizer)`` — that raised ``TypeError`` on every deepcopy.
    """
    cls = self.__class__
    copied = cls.__new__(cls)
    # Register the copy in memo before populating it, per the deepcopy
    # protocol, so any cyclic references resolve to this instance.
    memo[id(self)] = copied
    # `fsm` is the only attribute set by __init__; anything set later
    # (e.g. fsm_state) is deliberately NOT carried over — the copy
    # starts with fresh decoding state, matching the original intent.
    copied.fsm = self.fsm.copy()
    return copied
12 changes: 6 additions & 6 deletions vllm/model_executor/layers/logits_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,18 +87,18 @@ def _apply_logits_processors(
logits_row_idx = 0
found_logits_processors = False
for seq_ids, sampling_params in sampling_metadata.seq_groups:
logits_processors = sampling_params.logits_processors
if logits_processors:
found_logits_processors = True
for seq_id in seq_ids:
for i, seq_id in enumerate(seq_ids):
logits_processors = sampling_params.logits_processors[i]
if logits_processors:
found_logits_processors = True
logits_row = logits[logits_row_idx]
token_ids = sampling_metadata.seq_data[seq_id].output_token_ids
for logits_processor in logits_processors:
logits_row = logits_processor(token_ids, logits_row)
logits[logits_row_idx] = logits_row
logits_row_idx += 1
else:
logits_row_idx += len(seq_ids)
else:
logits_row_idx += len(seq_ids)
if found_logits_processors:
assert logits_row_idx == logits.shape[0]
return logits
25 changes: 19 additions & 6 deletions vllm/sampling_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import copy
from enum import IntEnum
from functools import cached_property
from typing import Callable, List, Optional, Union
from typing import Callable, List, Optional, Union, Sequence

import torch
from pydantic import conint
Expand Down Expand Up @@ -163,7 +163,14 @@ def __init__(
self.detokenize = detokenize
self.skip_special_tokens = skip_special_tokens
self.spaces_between_special_tokens = spaces_between_special_tokens
self.logits_processors = logits_processors
# A separate logit processor is needed for each output sequence
# since certain logits processors (such as BaseGuidedLogitsProcessor)
# in multi-beam generation must track the sequences generated
# by each beam up to that point.
# See https://github.com/vllm-project/vllm/issues/3448 for more
self.logits_processors = [
copy.deepcopy(logits_processors) for _ in range(n)
]
self.include_stop_str_in_output = include_stop_str_in_output
self.truncate_prompt_tokens = truncate_prompt_tokens
self._verify_args()
Expand Down Expand Up @@ -278,11 +285,17 @@ def clone(self) -> "SamplingParams":
arbitrary, nontrivial amount of data.
See https://github.com/vllm-project/vllm/issues/3087
"""
logit_processor_refs = None

if self.logits_processors:
logit_processor_refs = {}
for lp in self.logits_processors:
if lp:
lp_first = lp[0] if isinstance(lp, Sequence) else lp
if hasattr(lp_first, "fsm"):
continue
logit_processor_refs[id(lp)] = lp

logit_processor_refs = None if self.logits_processors is None else {
id(lp): lp
for lp in self.logits_processors
}
return copy.deepcopy(self, memo=logit_processor_refs)

def __repr__(self) -> str:
Expand Down
Loading