From c851d6990ab2c9ecc083eb22ed3ef255605ec908 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Tue, 13 Jun 2023 13:54:50 +0200 Subject: [PATCH 1/5] Fix float mask In some vocabularies the only token that contains a period is the "." period. However, the current regex used to create masks to generate floats excludes this token. This commit edits the regex to allow for the "." token. --- outlines/text/masks.py | 2 +- tests/text/test_masks.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/outlines/text/masks.py b/outlines/text/masks.py index a7e7b4f52..c57625736 100644 --- a/outlines/text/masks.py +++ b/outlines/text/masks.py @@ -42,7 +42,7 @@ def create_int_mask(vocabulary: Dict[str, int]) -> np.ndarray: def create_float_mask(vocabulary: Dict[str, int]) -> np.ndarray: """Create a mask to generate floating point numbers.""" - mask = create_mask_from_regex(vocabulary, r"^([0-9]+([.][0-9]*)?|[.][0-9]+)$") + mask = create_mask_from_regex(vocabulary, r"^(([0-9]+)?([.]([0-9]*)?)?|[.][0-9]+)$") return mask diff --git a/tests/text/test_masks.py b/tests/text/test_masks.py index c9d37353e..3c0dc782c 100644 --- a/tests/text/test_masks.py +++ b/tests/text/test_masks.py @@ -24,11 +24,12 @@ def test_float_mask(): "1.": 5, "0.": 6, "1.2.3": 7, + ".": 8, } mask = create_float_mask(vocabulary) assert_array_equal( - mask, np.array([True, True, False, False, True, True, True, False]) + mask, np.array([True, True, False, False, True, True, True, False, True]) ) From 7d5aae9b3c050455e6a5547b82cc96b07a59a43a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Tue, 20 Jun 2023 14:12:09 +0200 Subject: [PATCH 2/5] Add `Tokenizer` base class --- outlines/models/tokenizer.py | 23 +++++++++++++++++++++++ pyproject.toml | 1 + tests/models/test_tokenizer.py | 8 ++++++++ 3 files changed, 32 insertions(+) create mode 100644 outlines/models/tokenizer.py create mode 100644 tests/models/test_tokenizer.py diff --git a/outlines/models/tokenizer.py 
b/outlines/models/tokenizer.py new file mode 100644 index 000000000..84c317dd7 --- /dev/null +++ b/outlines/models/tokenizer.py @@ -0,0 +1,23 @@ +from abc import abstractmethod +from typing import List, Protocol, Tuple, Union + +import numpy as np +from numpy.typing import NDArray + + +class Tokenizer(Protocol): + eos_token: str + eos_token_id: int + pad_token_id: int + + @abstractmethod + def encode( + self, prompt: Union[str, List[str]] + ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]: + """Translate the input prompts into NumPy arrays of token ids and attention mask.""" + ... + + @abstractmethod + def decode(self, token_ids: NDArray[np.int64]) -> List[str]: + """Translate an array of token ids to a string or list of strings.""" + ... diff --git a/pyproject.toml b/pyproject.toml index 8e3e8cd6b..92d7a8d19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,6 +104,7 @@ omit = [ exclude_lines = [ "pragma: no cover", "if TYPE_CHECKING:", + "...", ] show_missing = true diff --git a/tests/models/test_tokenizer.py b/tests/models/test_tokenizer.py new file mode 100644 index 000000000..831f7fe3e --- /dev/null +++ b/tests/models/test_tokenizer.py @@ -0,0 +1,8 @@ +import pytest + +from outlines.models.tokenizer import Tokenizer + + +def test_tokenizer(): + with pytest.raises(TypeError, match="instantiate abstract"): + Tokenizer() From 98f9a04e1d40a3eb8ae8a26d7700b26434fd35f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 15 Jun 2023 11:07:30 +0200 Subject: [PATCH 3/5] Add the `Transformers` model --- outlines/models/__init__.py | 1 + outlines/models/transformers.py | 92 +++++++++++++++++++++++++++++++ pyproject.toml | 2 +- tests/models/test_transformers.py | 67 ++++++++++++++++++++++ 4 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 outlines/models/transformers.py create mode 100644 tests/models/test_transformers.py diff --git a/outlines/models/__init__.py b/outlines/models/__init__.py index 7e9029587..53653f0e6 100644 --- 
a/outlines/models/__init__.py +++ b/outlines/models/__init__.py @@ -9,3 +9,4 @@ from .hf_diffusers import HuggingFaceDiffuser from .hf_transformers import HuggingFaceCompletion from .openai import OpenAICompletion, OpenAIEmbeddings, OpenAIImageGeneration +from .transformers import transformers diff --git a/outlines/models/transformers.py b/outlines/models/transformers.py new file mode 100644 index 000000000..d71272f79 --- /dev/null +++ b/outlines/models/transformers.py @@ -0,0 +1,92 @@ +import math +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +import numpy as np +from numpy.typing import NDArray + +from outlines.models.tokenizer import Tokenizer + +if TYPE_CHECKING: + from transformers import PreTrainedModel, PreTrainedTokenizer + + +__all__ = ["transformers"] + + +class Transformers: + """Represents a `transformers` model.""" + + def __init__( + self, + model: "PreTrainedModel", + tokenizer: "PreTrainedTokenizer", + device: Optional[str] = None, + ): + self.device = device if device is not None else "cpu" + self.model = model.to(self.device) + self.tokenizer = tokenizer + + def __call__( + self, input_ids: NDArray[np.int64], attention_mask: NDArray[np.int64] + ) -> NDArray[np.float64]: + import torch + + # `transformers` models accept `input_ids` with at most 2 dimensions. We + # thus reshape the input array, call the model and reshape the output + # logits.
+ batch_shape = input_ids.shape[:-1] + num_tokens = input_ids.shape[-1] + input_ids = input_ids.reshape(math.prod(batch_shape), num_tokens) + + with torch.no_grad(): + input_ids = torch.from_numpy(input_ids).to(self.device) + attention_mask = torch.from_numpy(attention_mask).to(self.device) + + output = self.model(input_ids, attention_mask=attention_mask) + + next_token_logits = output.logits[:, -1, :] + probs = torch.nn.functional.softmax(next_token_logits, dim=-1).squeeze() + probs = torch.atleast_2d(probs) + numpy_probs = probs.cpu().detach().numpy() + + return numpy_probs.reshape(batch_shape + (-1,)) + + +class TransformersTokenizer(Tokenizer): + """Represents a tokenizer for models in the `transformers` library.""" + + def __init__(self, model_name: str, **kwargs): + from transformers import AutoTokenizer + + kwargs.setdefault("padding_side", "left") + self.tokenizer = AutoTokenizer.from_pretrained(model_name, **kwargs) + self.eos_token_id = self.tokenizer.eos_token_id + self.eos_token = self.tokenizer.eos_token + + if not self.tokenizer.pad_token_id: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.pad_token_id = self.eos_token_id + else: + self.pad_token_id = self.tokenizer.pad_token_id + self.pad_token = self.tokenizer.pad_token + + def encode( + self, prompt: Union[str, List[str]], **kwargs + ) -> Tuple[NDArray[np.int64], NDArray[np.int64]]: + kwargs["padding"] = True + kwargs["return_tensors"] = "np" + output = self.tokenizer(prompt, **kwargs) + return output["input_ids"], output["attention_mask"] + + def decode(self, token_ids: NDArray[np.int64]) -> List[str]: + text = self.tokenizer.batch_decode(token_ids) + return text + + +def transformers(model_name: str, device: Optional[str] = None, **model_kwargs): + from transformers import AutoModelForCausalLM + + model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs) + tokenizer = TransformersTokenizer(model_name) + + return Transformers(model, tokenizer, device) diff --git 
a/pyproject.toml b/pyproject.toml index 92d7a8d19..01222eeb9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,7 +86,7 @@ module = [ "tenacity.*", "tiktoken.*", "torch", - "transformers", + "transformers.*", ] ignore_missing_imports = true diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py new file mode 100644 index 000000000..1d7bcb40a --- /dev/null +++ b/tests/models/test_transformers.py @@ -0,0 +1,67 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal +from transformers.models.gpt2 import GPT2TokenizerFast + +from outlines.models.transformers import TransformersTokenizer, transformers + +TEST_MODEL = "hf-internal-testing/tiny-random-GPTJForCausalLM" + + +def test_tokenizer(): + tokenizer = TransformersTokenizer(TEST_MODEL) + assert tokenizer.eos_token_id == 0 + assert tokenizer.pad_token_id == 0 + assert isinstance(tokenizer.tokenizer, GPT2TokenizerFast) + + token_ids, attention_mask = tokenizer.encode("Test") + assert token_ids.ndim == 2 + assert token_ids.shape[0] == 1 + assert isinstance(token_ids, np.ndarray) + assert token_ids.shape == attention_mask.shape + + token_ids, attention_mask = tokenizer.encode(["Test", "Test"]) + assert token_ids.ndim == 2 + assert token_ids.shape[0] == 2 + assert isinstance(token_ids, np.ndarray) + assert token_ids.shape == attention_mask.shape + + token_ids, attention_mask = tokenizer.encode(["Test", "A long sentence"]) + assert token_ids.shape == attention_mask.shape + assert attention_mask[0][0] == tokenizer.pad_token_id + + text = tokenizer.decode(np.array([[0, 1, 2]])) + isinstance(text, str) + + text = tokenizer.decode(np.array([[0, 1, 2], [3, 4, 5]])) + isinstance(text, list) + isinstance(text[0], str) + isinstance(text[1], str) + + +def test_model(): + with pytest.raises(RuntimeError, match="Expected one of cpu, cuda"): + transformers(TEST_MODEL, device="non_existent") + + model = transformers(TEST_MODEL, device="cpu") + assert 
isinstance(model.tokenizer, TransformersTokenizer) + assert model.device == "cpu" + + input_ids = np.array([[0, 1, 2]]) + logits = model(input_ids, np.ones_like(input_ids)) + assert isinstance(logits, np.ndarray) + assert logits.ndim == 2 + assert logits.shape[0] == 1 + + input_ids = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) + logits = model(input_ids, np.ones_like(input_ids)) + assert isinstance(logits, np.ndarray) + assert logits.ndim == 2 + assert logits.shape[0] == 3 + + input_ids = np.array([[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [0, 1, 2]]]) + logits = model(input_ids, np.ones_like(input_ids)) + assert logits.ndim == 3 + assert logits.shape[0] == 2 + assert logits.shape[1] == 2 + assert_array_equal(logits[0][0], logits[1][1]) From 3ee77bfb3dcd0797d88baa4f2f736ad5a85a91ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 15 Jun 2023 14:44:55 +0200 Subject: [PATCH 4/5] Add the `Sequence` base class --- outlines/text/sequences/sequence.py | 250 ++++++++++++++++ pyproject.toml | 2 +- tests/text/sequences/test_sequence.py | 393 ++++++++++++++++++++++++++ 3 files changed, 644 insertions(+), 1 deletion(-) create mode 100644 outlines/text/sequences/sequence.py create mode 100644 tests/text/sequences/test_sequence.py diff --git a/outlines/text/sequences/sequence.py b/outlines/text/sequences/sequence.py new file mode 100644 index 000000000..bea23de4c --- /dev/null +++ b/outlines/text/sequences/sequence.py @@ -0,0 +1,250 @@ +from typing import List, Optional, Tuple, Union + +import numpy as np +from numpy.random import Generator +from numpy.typing import NDArray + + +class Sequence: + """Represents a sequence generation method.""" + + def __init__(self, model, max_tokens: Optional[int] = None): + """Create a `Sequence` instance. + + Parameters + ---------- + model + The instance of the model used to generate next-token probabilities. + max_tokens + The maximum number of tokens that will be generated if no termination + condition is met. 
+ + """ + self.model = model + self.max_tokens = max_tokens + + def is_finished(self, token_ids: NDArray[np.int64]) -> NDArray[np.bool_]: + """Determine whether we should stop the generation.""" + raise NotImplementedError( + "`Sequence.is_finished` must be implemented by subclasses." + ) + + def step( + self, + rng: Generator, + token_ids: NDArray[np.int64], + attention_mask: NDArray[np.int64], + samples: int = 1, + ) -> Tuple[NDArray[np.int64], NDArray[float]]: + """Generate one or several tokens that complete the input sequence. + + The sampling step consists in using a model to generate next-token + logits and then sampling `samples`-many new tokens from a categorical + distribution parametrized by these logits. + + Parameters + ---------- + rng + NumPy random number Generator instance + token_ids + The token ids passed as an input to the model, of shape `batch_shape + + (num_tokens,)`, where `num_tokens` is the sequences' length. + samples + The number of continuations to sample from the next-token probability + distribution. + + Returns + ------- + A tuple with an array of shape `new_batch_shape + (num_tokens+1,)` that + contains the completed sequences (input token ids and generated token + ids) and an array of shape `new_batch_shape + (vocab_size,)` that + contains the next token probabilities. + `new_batch_shape` is computed by removing dimensions of size one in + `(samples,) + batch_shape`. + + """ + num_input_dims = token_ids.ndim + probs = self.model(token_ids, attention_mask) + + # Sample `samples`-many new tokens + next_token_ids = vectorized_random_choice(rng, probs, samples) + + # Add the missing `num_tokens` and `num_sample` dimensions + next_token_ids = np.expand_dims(next_token_ids, -1) + token_ids = np.expand_dims(token_ids, 0) + + # Expand the input `token_ids` array to be able to concatenate several + # samples.
+ if samples > 1: + repetitions = (samples,) + (1,) * num_input_dims + token_ids = np.tile(token_ids, repetitions) + probs = np.tile(probs, repetitions) + + token_ids = np.concatenate([token_ids, next_token_ids], axis=-1) + + # Merge sample and batch dimensions by removing dimensions of length + # 1. The shape of the resulting arrays is `new_batch_shape + (num_tokens,)` + # and `new_batch_shape + (vocab_size,)` respectively. + token_ids = np.atleast_2d(token_ids.squeeze()) + probs = np.atleast_2d(probs.squeeze()) + + return token_ids, probs + + def expand_attention_mask( + self, attention_mask: NDArray[np.int64] + ) -> NDArray[np.int64]: + """Expand the attention mask after the last completion.""" + batch_shape = attention_mask.shape[:-1] + attention_mask = np.concatenate( + [attention_mask, np.broadcast_to([1], batch_shape + (1,))], axis=-1 + ) + return attention_mask + + def update_token_ids( + self, + is_finished: NDArray[np.bool_], + token_ids: NDArray[np.int64], + token_ids_unfinished: NDArray[np.int64], + ) -> NDArray[np.int64]: + """Update the array of token ids after the last completion. + + We only generate new tokens for the sequences that are not finished. We thus + update the array with the new tokens, and append pad tokens to the finished + sequences. + + Parameters + ---------- + is_finished + Boolean array that indicates which sequences are finished. + token_ids + Array that contains the sequences before the generation's last step. + token_ids_unfinished + Array that contains the unfinished sequences + after the generation's last step. + + Returns + ------- + The updated array of sequences, in which the new tokens extend the unfinished + sequences and pad tokens are appended to the finished ones.
+ + """ + batch_shape = token_ids.shape[:-1] + num_tokens = token_ids.shape[-1] + new_token_ids = np.empty(batch_shape + (num_tokens + 1,), dtype=np.int64) + + token_ids_finished = token_ids[is_finished] + batch_shape_finished = token_ids_finished.shape[:-1] + token_ids_finished = np.concatenate( + [ + token_ids_finished, + np.broadcast_to( + [self.model.tokenizer.pad_token_id], batch_shape_finished + (1,) + ), + ], + axis=-1, + ) + + new_token_ids[~is_finished] = token_ids_unfinished + new_token_ids[is_finished] = token_ids_finished + + return new_token_ids + + def __call__( + self, + prompt: Union[str, List[str]], + samples: int = 1, + rng: Generator = np.random.default_rng(), + ) -> Union[str, List[str]]: + """Generate a new sequence given a prompt. + + Parameters + ---------- + prompt + The input prompt. + samples + The number of samples to generate for each prompt. + + Returns + ------- + The full sequence that contains the prompts and the generated string. + + """ + token_ids, attention_mask = self.model.tokenizer.encode(prompt) + num_prompt_tokens = token_ids.shape[-1] + + if samples > 1: + token_ids, _ = self.step(rng, token_ids, attention_mask, samples) + is_finished = self.is_finished(token_ids) + + num_batch_dims = token_ids.ndim - 1 + repetitions = (samples,) + (1,) * num_batch_dims + attention_mask = np.tile(attention_mask, repetitions) + attention_mask = self.expand_attention_mask(attention_mask) + else: + batch_shape = token_ids.shape[:-1] + is_finished = np.zeros(batch_shape, dtype=np.bool_) + + while True: + num_generated_tokens = token_ids.shape[-1] - num_prompt_tokens + if np.all(is_finished) or num_generated_tokens == self.max_tokens: + break + + token_ids_unfinished = token_ids[~is_finished] + attention_mask_unfinished = attention_mask[~is_finished] + token_ids_unfinished, _ = self.step( + rng, token_ids_unfinished, attention_mask_unfinished + ) + + token_ids = self.update_token_ids( + is_finished, token_ids, token_ids_unfinished + ) + 
attention_mask = self.expand_attention_mask(attention_mask) + is_finished[~is_finished] = self.is_finished(token_ids_unfinished).flatten() + + result = self.model.tokenizer.decode(token_ids) + + if len(result) == 1: + return result[0] + + return result + + +vsearchsorted = np.vectorize(np.searchsorted, otypes=[int], signature="(n),()->()") + + +def vectorized_random_choice( + rng: Generator, + p: NDArray[np.float64], + samples: int = 1, +): + """Vectorized implementation of `np.random.choice`. + + `np.random.choice` does not support arrays of probability. This implements + the equivalent of this function where the `p` argument can be a matrix. + + Note + ---- + `searchsorted` might be more efficient here since the number of elements + can be quite large. + + Parameters + ---------- + rng + NumPy random number Generator instance + p + An array of probability of shape `(num_probability_vectors, num_items)` + that must sum to 1. + samples + The number of samples to take for each probability vector. 
+ + Returns + ------- + An array of shape `(num_samples, batch_size)` + + """ + + cumsum = np.expand_dims(p.cumsum(axis=-1), 0) + rand = rng.random((samples,) + p.shape[:-1]) + idx = vsearchsorted(cumsum, rand) + + return idx diff --git a/pyproject.toml b/pyproject.toml index 01222eeb9..62c7ae99a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,8 +75,8 @@ module = [ "diffusers", "jinja2", "joblib", - "numpy.*", "openai", + "numpy.*", "perscache.*", "PIL", "PIL.Image", diff --git a/tests/text/sequences/test_sequence.py b/tests/text/sequences/test_sequence.py new file mode 100644 index 000000000..946990102 --- /dev/null +++ b/tests/text/sequences/test_sequence.py @@ -0,0 +1,393 @@ +from typing import Dict, List, Union + +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from outlines.text.sequences.sequence import Sequence, vectorized_random_choice + + +def test_vectorized_random_choice(): + rng = np.random.default_rng(0) + + probs = np.array([[1, 0, 0, 0]]) + sample = vectorized_random_choice(rng, probs) + assert sample.shape == (1, 1) + assert_array_equal(sample, np.zeros((1, 1))) + + probs = np.array([[1, 0, 0, 0]]) + sample = vectorized_random_choice(rng, probs, samples=3) + assert sample.shape == (3, 1) + assert_array_equal(sample, np.zeros((3, 1))) + + probs = np.tile(np.array([[1, 0, 0, 0]]), (2, 1)) + sample = vectorized_random_choice(rng, probs) + assert sample.shape == (1, 2) + assert_array_equal(sample, np.zeros((1, 2))) + + probs = np.array([[1, 0, 0, 0], [0, 1, 0, 0]]) + sample = vectorized_random_choice(rng, probs, samples=3) + assert sample.shape == (3, 2) + assert_array_equal(sample, [[0, 1], [0, 1], [0, 1]]) + + probs = np.array([[[1, 0, 0, 0], [0, 1, 0, 0]], [[0, 0, 1, 0], [0, 0, 0, 1]]]) + sample = vectorized_random_choice(rng, probs, samples=3) + assert sample.shape == (3, 2, 2) + assert_array_equal(sample, [[[0, 1], [2, 3]], [[0, 1], [2, 3]], [[0, 1], [2, 3]]]) + + +def test_sequence_error(): + with 
pytest.raises(NotImplementedError, match="must be implemented"): + sequence = Sequence(None) + sequence.is_finished(np.array([1])) + + +def ModelStep(logits): + """Mock model to test `Sequence.step`""" + + logits = np.array([logits]) + + def call(input_ids, *_): + """Call the model. + + We first repeat the logits `num_sequences` times, and then + reshape the resulting array to match the batch size. + + """ + import math + + batch_shape = input_ids.shape[:-1] + vocab_shape = (logits.shape[-1],) + shaped_logits = np.tile(logits, (math.prod(batch_shape), 1)) + return shaped_logits.reshape(batch_shape + vocab_shape) + + return call + + +def test_sequence_step(): + rng = np.random.default_rng(0) + + logits = np.array([0, 1, 0, 0]) + model = ModelStep(logits) + + sequence = Sequence(model) + + input_ids = np.array([[1, 2]]) + token_ids, probs = sequence.step(rng, input_ids, np.ones((1, 2))) + assert_array_equal(token_ids, [[1, 2, 1]]) + assert probs.shape == (1, 4) + + +def test_sequence_step_batch(): + rng = np.random.default_rng(0) + + logits = np.array([0, 1, 0, 0]) + model = ModelStep(logits) + + sequence = Sequence(model) + + input_ids = np.array([[1, 2], [3, 4]]) + token_ids, probs = sequence.step(rng, input_ids, np.ones((2, 2))) + assert_array_equal(token_ids, [[1, 2, 1], [3, 4, 1]]) + assert probs.shape == (2, 4) + + +def test_sequence_step_sample(): + rng = np.random.default_rng(0) + + logits = np.array([0, 1, 0, 0]) + model = ModelStep(logits) + + sequence = Sequence(model) + input_ids = np.array([[1, 2]]) + token_ids, probs = sequence.step(rng, input_ids, np.ones((1, 2)), samples=3) + assert_array_equal(token_ids, [[1, 2, 1], [1, 2, 1], [1, 2, 1]]) + assert probs.shape == (3, 4) + + +def test_sequence_sample_batch(): + rng = np.random.default_rng(0) + + logits = np.array([0, 1, 0, 0]) + model = ModelStep(logits) + + sequence = Sequence(model) + input_ids = np.array([[1, 2, 1], [3, 4, 1]]) + token_ids, probs = sequence.step(rng, input_ids, np.ones((2, 3)), 
samples=3) + assert_array_equal( + token_ids, + [ + [[1, 2, 1, 1], [3, 4, 1, 1]], + [[1, 2, 1, 1], [3, 4, 1, 1]], + [[1, 2, 1, 1], [3, 4, 1, 1]], + ], + ) + assert probs.shape == (3, 2, 4) + + +def test_sequence_step_loop(): + """Make sure that we can feed `step`'s output back as an input.""" + + rng = np.random.default_rng(0) + + logits = np.array([0, 1, 0, 0]) + model = ModelStep(logits) + + sequence = Sequence(model) + input_ids = np.array([[1, 2]]) + token_ids, _ = sequence.step(rng, input_ids, np.ones((1, 2))) + token_ids, probs = sequence.step(rng, token_ids, np.ones((1, 3))) + assert_array_equal(token_ids, [[1, 2, 1, 1]]) + assert probs.shape == (1, 4) + + input_ids = np.array([[1, 2], [3, 4]]) + token_ids, _ = sequence.step(rng, input_ids, np.ones((2, 2))) + token_ids, probs = sequence.step(rng, token_ids, np.ones((2, 3))) + assert_array_equal(token_ids, [[1, 2, 1, 1], [3, 4, 1, 1]]) + assert probs.shape == (2, 4) + + # The number of samples becomes the batch size at the next iteration. 
+ input_ids = np.array([[1, 2]]) + token_ids, _ = sequence.step(rng, input_ids, np.ones((1, 2)), samples=3) + token_ids, probs = sequence.step(rng, token_ids, np.ones((3, 3))) + assert_array_equal(token_ids, [[1, 2, 1, 1], [1, 2, 1, 1], [1, 2, 1, 1]]) + assert probs.shape == (3, 4) + + +def test_sequence_step_loop_general(): + rng = np.random.default_rng(0) + + logits = np.array([0, 1, 0, 0]) + model = ModelStep(logits) + + sequence = Sequence(model) + input_ids = np.array([[1, 2, 1], [3, 4, 1]]) + token_ids, _ = sequence.step(rng, input_ids, np.ones((1, 3)), samples=3) + result, _ = sequence.step(rng, token_ids, np.ones((3, 4))) + assert result.shape == (3, 2, 5) + assert_array_equal( + result, + [ + [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]], + [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]], + [[1, 2, 1, 1, 1], [3, 4, 1, 1, 1]], + ], + ) + + +class TokenizerUpdateTokens: + pad_token_id = -1 + + +class ModelUpdateTokens: + tokenizer = TokenizerUpdateTokens() + + +def test_update_token_ids_all_unfinished(): + sequence = Sequence(ModelUpdateTokens()) + + previous_token_ids = np.array([[1, 1], [1, 1]]) + is_finished = np.array([False, False]) + token_ids_unfinished = np.array([[1, 1, 1], [1, 1, 1]]) + + result = sequence.update_token_ids( + is_finished, previous_token_ids, token_ids_unfinished + ) + assert_array_equal(result, [[1, 1, 1], [1, 1, 1]]) + + +def test_update_token_ids_some_unfinished(): + "Makes sure that the pad token is appended to finished sequences." 
+ sequence = Sequence(ModelUpdateTokens()) + + previous_token_ids = np.array([[1, 1], [1, 1]]) + token_ids_unfinished = np.array([[1, 1, 1]]) + is_finished = np.array([True, False]) + result = sequence.update_token_ids( + is_finished, previous_token_ids, token_ids_unfinished + ) + assert_array_equal(result, [[1, 1, -1], [1, 1, 1]]) + + +@pytest.mark.xfail +def test_update_token_ids_larger_dimensions(): + sequence = Sequence(ModelUpdateTokens()) + + previous_token_ids = np.array([[1, 1], [1, 1]]) + is_finished = np.array([False, False]) + token_ids_unfinished = np.array([[1, 1, 1], [1, 1, 1]]) + result = sequence.update_token_ids( + is_finished, previous_token_ids, token_ids_unfinished + ) + assert_array_equal(result, [[1, 1, -1], [1, 1, 1]]) + + +class MockModel: + def __init__(self, tokenizer, logits): + self.tokenizer = tokenizer + self.logits = np.array(logits) + self.iteration_idx = 0 + + def __call__(self, input_ids, *_): + import math + + batch_shape = input_ids.shape[:-1] + vocab_shape = (self.logits.shape[-1],) + shaped_logits = np.tile( + self.logits[self.iteration_idx], (math.prod(batch_shape), 1) + ) + self.iteration_idx += 1 + + return shaped_logits.reshape(batch_shape + vocab_shape) + + +class MockTokenizer: + def __init__(self, vocabulary: Dict[str, int]): + self.vocabulary = vocabulary + self.pad_token_id = -1 + + def encode(self, prompts: Union[str, List[str]]): + if isinstance(prompts, str): + prompts = [prompts] + + token_ids = np.array([[self.vocabulary[prompt]] for prompt in prompts]) + attention_mask = np.ones_like(token_ids) + + return token_ids, attention_mask + + def decode(self, token_ids): + return token_ids + + +def test_call_single_prompt(): + class FinishAfterTwo(Sequence): + def __init__(self, model): + super().__init__(model) + self.iteration_idx = 0 + + def is_finished(self, token_ids): + """Finish generating the sequence after two iterations""" + if self.iteration_idx == 0: + self.iteration_idx += 1 + return np.array([False]) + 
+ else: + return np.array([True]) + + tokenizer = MockTokenizer({"Test": 0, "a": 1, "b": 2}) + model = MockModel(tokenizer, [[1, 0, 0], [0, 1, 0]]) + sequence = FinishAfterTwo(model) + + result = sequence("Test") + assert_array_equal(result, [0, 0, 1]) + + +def test_call_prompt_list(): + class Tokenizer: + def __init__(self, vocabulary: Dict[str, int]): + self.vocabulary = vocabulary + self.pad_token_id = -1 + + def __call__(self, prompts: List[str], **_): + return { + "input_ids": np.array([[self.vocabulary[prompt]] for prompt in prompts]) + } + + def batch_decode(self, token_ids): + return token_ids + + class FinishAfterThree(Sequence): + def __init__(self, model): + super().__init__(model) + self.iteration_idx = 0 + + def is_finished(self, token_ids): + """Finish generating the first and third sequences after two iterations and the + second one after three iterations. + + """ + if self.iteration_idx == 0: + self.iteration_idx += 1 + return np.array([False, False, False]) + elif self.iteration_idx == 1: + self.iteration_idx += 1 + return np.array([True, False, True]) + else: + return np.array([True]) # We only consider the unfinished sequences + + tokenizer = MockTokenizer( + {"Test1": 0, "Test2": 1, "a": 2, "b": 3, "c": 4, "Test3": 5} + ) + model = MockModel( + tokenizer, + [[0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 1, 0]], + ) + sequence = FinishAfterThree(model) + + result = sequence(["Test1", "Test2", "Test3"]) + assert_array_equal(result, [[0, 2, 3, -1], [1, 2, 3, 4], [5, 2, 3, -1]]) + + +def test_call_single_prompt_samples(): + class FinishAfterTwo(Sequence): + def __init__(self, model): + super().__init__(model) + self.iteration_idx = 0 + + def is_finished(self, token_ids): + if self.iteration_idx == 0: + self.iteration_idx += 1 + return np.array([False, False, False]) + else: + return np.array([True, True, True]) + + tokenizer = MockTokenizer({"a": 0, "b": 1, "c": 2, "Test": 4}) + model = MockModel(tokenizer, [[1, 0, 0, 0], [0, 1, 0, 0]]) + sequence =
FinishAfterTwo(model) + result = sequence("Test", samples=3) + assert_array_equal(result, [[4, 0, 1], [4, 0, 1], [4, 0, 1]]) + + class FinishAfterOne(Sequence): + def __init__(self, model): + super().__init__(model) + + def is_finished(self, token_ids): + return np.array([True, True, True]) + + tokenizer = MockTokenizer({"a": 0, "b": 1, "c": 3, "Test": 4}) + model = MockModel(tokenizer, [[1, 0, 0, 0], [0, 1, 0, 0]]) + sequence = FinishAfterOne(model) + result = sequence("Test", samples=3) + assert_array_equal(result, [[4, 0], [4, 0], [4, 0]]) + + +def test_call_prompt_list_samples(): + class FinishAfterThree(Sequence): + def __init__(self, model): + super().__init__(model) + self.iteration_idx = 0 + + def is_finished(self, token_ids): + if self.iteration_idx == 0: + self.iteration_idx += 1 + batch_shape = token_ids.shape[:-1] + return np.zeros(batch_shape, dtype=np.bool_) + elif self.iteration_idx == 1: + self.iteration_idx += 1 + return np.array( + [[True, False, True], [True, False, True], [True, False, True]] + ) + else: + return np.array([True, True, True]) + + tokenizer = MockTokenizer( + {"a": 0, "b": 1, "c": 2, "Test1": 3, "Test2": 4, "Test3": 5} + ) + model = MockModel( + tokenizer, [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]] + ) + sequence = FinishAfterThree(model) + + result = sequence(["Test1", "Test2", "Test3"], samples=3) + assert_array_equal( + result, np.tile([[3, 0, 1, -1], [4, 0, 1, 2], [5, 0, 1, -1]], (3, 1, 1)) + ) From ed8021b61d9c8ecae05565358a26bba3c0d5bceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Mon, 12 Jun 2023 15:50:28 +0200 Subject: [PATCH 5/5] Add `Continuation` generation model --- outlines/text/__init__.py | 1 + outlines/text/generate/__init__.py | 1 + outlines/text/generate/continuation.py | 52 +++++++++++++++++++ .../text/{sequences => generate}/sequence.py | 4 ++ tests/text/generate/test_continuation.py | 42 +++++++++++++++ .../generate/test_integration_transfomers.py | 24 +++++++++ 
.../{sequences => generate}/test_sequence.py | 2 +- 7 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 outlines/text/generate/__init__.py create mode 100644 outlines/text/generate/continuation.py rename outlines/text/{sequences => generate}/sequence.py (98%) create mode 100644 tests/text/generate/test_continuation.py create mode 100644 tests/text/generate/test_integration_transfomers.py rename tests/text/{sequences => generate}/test_sequence.py (99%) diff --git a/outlines/text/__init__.py b/outlines/text/__init__.py index 4b187905e..8870c7a1f 100644 --- a/outlines/text/__init__.py +++ b/outlines/text/__init__.py @@ -1,2 +1,3 @@ from .functions import function +from .generate import continuation from .prompts import prompt, render diff --git a/outlines/text/generate/__init__.py b/outlines/text/generate/__init__.py new file mode 100644 index 000000000..3176b9b4a --- /dev/null +++ b/outlines/text/generate/__init__.py @@ -0,0 +1 @@ +from .continuation import continuation diff --git a/outlines/text/generate/continuation.py b/outlines/text/generate/continuation.py new file mode 100644 index 000000000..e616d3f36 --- /dev/null +++ b/outlines/text/generate/continuation.py @@ -0,0 +1,52 @@ +from typing import List, Optional + +import numpy as np +from numpy.typing import NDArray + +from outlines.text.generate.sequence import Sequence + + +class Continuation(Sequence): + """Represents a completion generation model. + + `Continuation` instances are unconstrained generation models that stop when an EOS token + has been found or when the maximum number of tokens has been reached. + + >> import outlines.text as text + >> sequence = text.continuation(model)("Say something") + + """ + + def __init__(self, model, max_tokens: Optional[int]): + super().__init__(model, max_tokens) + + def is_finished(self, token_ids: NDArray[np.int64]) -> NDArray[np.bool_]: + """Determine whether the sequences reached maximum length or end with + an EOS token.
+ + In practice, `Sequence`'s `__call__` method only passes the `token_ids` + of the sequences that haven't been marked as finished already, which is + why we only need to look for the EOS token in the last element rather + than in the whole sequence. + + Parameters + ---------- + token_ids + The input sequences. + + """ + is_finished = np.zeros((token_ids.shape[0],), dtype=np.bool_) + is_finished[token_ids[:, -1] == self.model.tokenizer.eos_token_id] = True + + return is_finished + + def postprocess_completions(self, completions: List[str]) -> List[str]: + """Remove the EOS token from the completion.""" + return [ + completion.replace(self.model.tokenizer.eos_token, "") + for completion in completions + ] + + +def continuation(model, max_tokens: Optional[int] = None): + return Continuation(model, max_tokens) diff --git a/outlines/text/sequences/sequence.py b/outlines/text/generate/sequence.py similarity index 98% rename from outlines/text/sequences/sequence.py rename to outlines/text/generate/sequence.py index bea23de4c..614297edd 100644 --- a/outlines/text/sequences/sequence.py +++ b/outlines/text/generate/sequence.py @@ -29,6 +29,9 @@ def is_finished(self, token_ids: NDArray[np.int64]) -> NDArray[np.bool_]: "`Sequence.is_finished` must be implemented by subclasses."
) + def postprocess_completions(self, completions: List[str]) -> List[str]: + return completions + def step( self, rng: Generator, @@ -202,6 +205,7 @@ def __call__( is_finished[~is_finished] = self.is_finished(token_ids_unfinished).flatten() result = self.model.tokenizer.decode(token_ids) + result = self.postprocess_completions(result) if len(result) == 1: return result[0] diff --git a/tests/text/generate/test_continuation.py b/tests/text/generate/test_continuation.py new file mode 100644 index 000000000..aaf017491 --- /dev/null +++ b/tests/text/generate/test_continuation.py @@ -0,0 +1,42 @@ +import numpy as np +from numpy.testing import assert_array_equal + +from outlines.text.generate.continuation import Continuation, continuation + + +class Tokenizer: + eos_token = "" + eos_token_id = 0 + pad_token_ids = -1 + + +class Model: + tokenizer = Tokenizer() + + +def test_continuation_is_finished(): + model = continuation(Model(), 10) + assert isinstance(model, Continuation) + + token_ids = np.array([[3, 2]]) + result = model.is_finished(token_ids) + assert_array_equal(result, [False]) + + token_ids = np.array([[3, 2, 0]]) + result = model.is_finished(token_ids) + assert_array_equal(result, [True]) + + token_ids = np.array([[3, 2, 1], [3, 2, 0]]) + result = model.is_finished(token_ids) + assert_array_equal(result, [False, True]) + + token_ids = np.array([[3, 2, 1, 0], [3, 2, 0, -1]]) + result = model.is_finished(token_ids) + assert_array_equal(result, [True, False]) + + +def test_continuation_postprocess(): + model = continuation(Model()) + result = model.postprocess_completions(["Here"]) + assert len(result) == 1 + assert result[0] == "Here" diff --git a/tests/text/generate/test_integration_transfomers.py b/tests/text/generate/test_integration_transfomers.py new file mode 100644 index 000000000..55bbde966 --- /dev/null +++ b/tests/text/generate/test_integration_transfomers.py @@ -0,0 +1,24 @@ +import numpy as np + +import outlines.models as models +from 
outlines.text.generate.continuation import continuation + + +def test_transformers_integration_completion(): + rng = np.random.default_rng(0) + + model_name = "hf-internal-testing/tiny-random-GPTJForCausalLM" + model = models.transformers(model_name, device="cpu") + sequence = continuation(model)("prompt", rng=rng) + assert isinstance(sequence, str) + assert model.tokenizer.eos_token not in sequence + + sequence = continuation(model, max_tokens=10)("prompt", rng=rng) + assert isinstance(sequence, str) + + +def test_transformers_integration_with_pad_token(): + model_name = "hf-internal-testing/tiny-random-XLMRobertaXLForCausalLM" + model = models.transformers(model_name, device="cpu") + assert model.tokenizer.pad_token_id == 1 + assert model.tokenizer.pad_token == "" diff --git a/tests/text/sequences/test_sequence.py b/tests/text/generate/test_sequence.py similarity index 99% rename from tests/text/sequences/test_sequence.py rename to tests/text/generate/test_sequence.py index 946990102..9659e8d6a 100644 --- a/tests/text/sequences/test_sequence.py +++ b/tests/text/generate/test_sequence.py @@ -4,7 +4,7 @@ import pytest from numpy.testing import assert_array_equal -from outlines.text.sequences.sequence import Sequence, vectorized_random_choice +from outlines.text.generate.sequence import Sequence, vectorized_random_choice def test_vectorized_random_choice():