Perplexity Eval for Text Generation Models #1073
```diff
@@ -89,6 +89,8 @@ class TextGenerationPipeline(TransformersPipeline):
         of tokens supplied even if the stop token is reached.
     :param use_deepsparse_cache: if True, the pipeline will use the deepsparse kv cache
         for caching the model outputs.
+    :param tokenizer_padding_side: the side to pad the input sequence to.
+        Either "left" or "right". Defaults to "left".
     :param kwargs: kwargs to pass to the TransformersPipeline
     """
```

**Review comment** (on the `tokenizer_padding_side` docs): As discussed offline, running right-padded for eval will likely not work for the engine (single-token prefill). Internally it builds the KV cache assuming left padding and pops entries from the left side of the cache as it is built up; in the right-padded scenario I believe this would evict the actual non-padded values from the cache too early.
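To make the comment concrete, here is a minimal sketch of how `padding_side` changes the token layout, assuming a GPT-2 tokenizer stands in for the pipeline's tokenizer (illustrative only, not part of the PR):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token

for side in ("left", "right"):
    tokenizer.padding_side = side
    out = tokenizer("hello world", padding="max_length", max_length=5)
    print(side, out["input_ids"], out["attention_mask"])
# left:  [pad, pad, pad, hello, world] -- real tokens sit at the end, so an
#        engine that pops cache entries from the left discards padding first.
# right: [hello, world, pad, pad, pad] -- real tokens sit at the start, so
#        popping from the left would evict them before the padding.
```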
```diff
@@ -101,6 +103,7 @@ def __init__(
         prompt_processing_sequence_length: int = 128,
         force_max_tokens: bool = False,
         use_deepsparse_cache: bool = False,
+        tokenizer_padding_side: str = "left",
         **kwargs,
     ):
         if use_deepsparse_cache:
```
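For reference, a hypothetical usage sketch of the new parameter; the task name and model path below are assumptions for illustration, and only `tokenizer_padding_side` comes from this PR:

```python
from deepsparse import Pipeline

pipeline = Pipeline.create(
    task="text_generation",          # assumed task registration name
    model_path="./deployment",       # placeholder model path
    tokenizer_padding_side="left",   # new in this PR; see the padding caveat above
)
```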
```diff
@@ -126,8 +129,7 @@ def __init__(
         self.prompt_processing_sequence_length = prompt_processing_sequence_length
         self.force_max_tokens = force_max_tokens

-        # override tokenizer to pad to left
-        self.tokenizer.padding_side = "left"
+        self.tokenizer.padding_side = tokenizer_padding_side

         self.engine = None
         self.multitoken_engine = NLDecoderEngine(
```
```diff
@@ -207,6 +209,8 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]:
             return_tensors="np",
             max_length=self.sequence_length,
             padding="max_length",
+            # TODO: Truncating by default may be a problem
+            truncation=True,
         )

         attention_mask = input_tokens["attention_mask"]
```
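The TODO above flags that `truncation=True` silently drops tokens past `max_length`, which matters for a perplexity eval since dropped tokens are never scored. A minimal sketch of the behavior, again assuming a GPT-2 tokenizer as a stand-in:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

tokens = tokenizer(
    "a prompt that runs well past the configured max length",
    return_tensors="np",
    max_length=4,
    padding="max_length",
    truncation=True,  # everything after the 4th token is silently dropped
)
print(tokens["input_ids"].shape)  # (1, 4) no matter how long the prompt is
```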
```diff
@@ -240,7 +244,9 @@ def process_engine_outputs(
         """
         generated_tokens, generated_logits = engine_outputs
         sequences = self.tokenizer.batch_decode(
-            *generated_tokens, skip_special_tokens=True
+            # TODO: hack for now, make it general
+            *generated_tokens[0],
+            skip_special_tokens=True,
         )
         logits = generated_logits if kwargs.get("return_logits") else None
```
**Review comment** (on `process_engine_outputs`): We need all of the logits predicted from the prefix sequences {}, {x1}, {x1, x2}, ..., {x1, x2, ..., x_n}; that is, the logits at every position, since a perplexity eval scores each token against the distribution predicted from its prefix.
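To spell out why every position's logits are needed: perplexity is the exponential of the mean negative log-likelihood, and the likelihood of token x_t comes from the logits emitted after the prefix {x_1, ..., x_{t-1}}. A minimal NumPy sketch (not the PR's implementation; the shapes are assumptions):

```python
import numpy as np

def perplexity(logits: np.ndarray, token_ids: np.ndarray) -> float:
    """logits: (seq_len, vocab_size), where logits[t] predicts token_ids[t + 1]."""
    # numerically stable log-softmax over the vocab dimension
    shifted = logits - logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    # log-probability the model assigned to each actual next token
    next_token_log_probs = log_probs[np.arange(len(token_ids) - 1), token_ids[1:]]
    return float(np.exp(-next_token_log_probs.mean()))
```

If only the logits of newly generated tokens were returned, the prompt tokens could never be scored, which is why the reviewer asks for the logits at every position.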