From 48ac0accaa8a346327a96d4d8dde194e28b3be3a Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 5 Jun 2023 15:55:03 +0000 Subject: [PATCH 01/68] initial commit --- src/deepsparse/license.py | 3 +- src/deepsparse/server/cli.py | 1 + src/deepsparse/transformers/helpers.py | 12 +++++- src/deepsparse/utils/onnx.py | 55 +++++++++++++++++++------- src/deepsparse/yolo/utils/utils.py | 24 ++++++----- 5 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/deepsparse/license.py b/src/deepsparse/license.py index ed436aaaf9..f4035072d3 100644 --- a/src/deepsparse/license.py +++ b/src/deepsparse/license.py @@ -53,7 +53,7 @@ def add_deepsparse_license(token_or_path): candidate_license_file_path = token_or_path if not os.path.exists(token_or_path): - # write raw token to temp file for validadation + # write raw token to temp file for validation candidate_license_tempfile = NamedTemporaryFile() candidate_license_file_path = candidate_license_tempfile.name with open(candidate_license_file_path, "w") as token_file: @@ -70,6 +70,7 @@ def add_deepsparse_license(token_or_path): license_file_path = _get_license_file_path() shutil.copy(candidate_license_file_path, license_file_path) _LOGGER.info(f"DeepSparse license file written to {license_file_path}") + os.remove(candidate_license_file_path) # re-validate and print message now that licensee is copied to expected location validate_license() diff --git a/src/deepsparse/server/cli.py b/src/deepsparse/server/cli.py index 1b323e28e3..29cbc9afb0 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -228,6 +228,7 @@ def main( loggers={}, ) + # saving yaml config to temporary directory with TemporaryDirectory() as tmp_dir: config_path = os.path.join(tmp_dir, "server-config.yaml") with open(config_path, "w") as fp: diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d80949eb11..d798231050 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -136,6 +136,7 @@ def overwrite_transformer_onnx_model_inputs( batch_size: int = 1, max_length: int = 128, output_path: Optional[str] = None, + inplace: bool = True, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ Overrides an ONNX model's inputs to have the given batch size and sequence lengths. @@ -148,12 +149,21 @@ def overwrite_transformer_onnx_model_inputs( :param output_path: if provided, the model will be saved to the given path, otherwise, the model will be saved to a named temporary file that will be deleted after the program exits + :param inplace: if True, the model will be modified in place, otherwise + a copy of the model will be saved to a temporary file :return: if no output path, a tuple of the saved path to the model, list of model input names, and reference to the tempfile object will be returned otherwise, only the model input names will be returned """ + + if inplace and output_path is None: + raise ValueError( + "Cannot specify both inplace=True and output_path. 
If inplace=True, " + "the model will be modified in place (the returned path will be identical" + "to the input path specified in argument `path`)" + ) # overwrite input shapes - model = onnx.load(path) + model = onnx.load(path, load_external_data=not inplace) initializer_input_names = set([node.name for node in model.graph.initializer]) external_inputs = [ inp for inp in model.graph.input if inp.name not in initializer_input_names diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 326c4b215d..8b40ab4346 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -24,7 +24,7 @@ from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE from deepsparse.utils.extractor import Extractor -from sparsezoo.utils import save_onnx, validate_onnx +from sparsezoo.utils import onnx_includes_external_data, save_onnx, validate_onnx try: @@ -53,13 +53,21 @@ @contextlib.contextmanager -def save_onnx_to_temp_files(model: Model, with_external_data=True) -> str: +def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> str: """ Save model to a temporary file. Works for models with external data. + :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ + if not onnx_includes_external_data(model) and with_external_data: + raise ValueError( + "Model does not include external data, it only includes the model graph." + "Cannot save its external data to separate a file." + "Set argument `with_external_data`=False" + ) shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) + if with_external_data: external_data = os.path.join( tempfile.tempdir, next(tempfile._get_candidate_names()) @@ -195,16 +203,27 @@ def generate_random_inputs( def override_onnx_batch_size( - onnx_filepath: str, batch_size: int, inplace: bool = False + onnx_filepath: str, + batch_size: int, + inplace: bool = True, ) -> str: """ Rewrite batch sizes of ONNX model, saving the modified model and returning its path - :param onnx_filepath: File path to ONNX model + + :param onnx_filepath: File path to ONNX model. If the graph is to be + modified in-place, only the model graph will be loaded and modified. + Otherwise, the entire model will be loaded and modified, so that + external data are saved along the model graph. :param batch_size: Override for the batch size dimension - :param inplace: If True, overwrite the original model file - :return: File path to modified ONNX model + :param inplace: If True, overwrite the original model file. + Else save the modified model to a temporary file. + :return: File path to modified ONNX model. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. 
""" - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath, load_external_data=not inplace) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -215,30 +234,38 @@ def override_onnx_batch_size( # Save modified model, this will be cleaned up when context is exited if inplace: - onnx.save(model, onnx_filepath) + save_onnx(model, onnx_filepath) return onnx_filepath else: # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + return save_onnx_to_temp_files(model, with_external_data=not inplace) def override_onnx_input_shapes( onnx_filepath: str, input_shapes: Union[List[int], List[List[int]]], - inplace: bool = False, + inplace: bool = True, ) -> str: """ Rewrite input shapes of ONNX model, saving the modified model and returning its path - :param onnx_filepath: File path to ONNX model + + :param onnx_filepath: File path to ONNX model. If the graph is to be + modified in-place, only the model graph will be loaded and modified. + Otherwise, the entire model will be loaded and modified, so that + external data are saved along the model graph. :param input_shapes: Override for model's input shapes :param inplace: If True, overwrite the original model file - :return: File path to modified ONNX model + :return: File path to modified ONNX model. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. """ if input_shapes is None: return onnx_filepath - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath, load_external_data=not inplace) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -279,7 +306,7 @@ def override_onnx_input_shapes( return onnx_filepath else: # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + return save_onnx_to_temp_files(model, with_external_data=not inplace) def truncate_onnx_model( diff --git a/src/deepsparse/yolo/utils/utils.py b/src/deepsparse/yolo/utils/utils.py index ebbd48233b..3a0f596fe1 100644 --- a/src/deepsparse/yolo/utils/utils.py +++ b/src/deepsparse/yolo/utils/utils.py @@ -29,6 +29,7 @@ import yaml import torch +from deepsparse.utils.onnx import save_onnx_to_temp_files from deepsparse.yolo.schemas import YOLOOutput from sparsezoo.utils import save_onnx @@ -341,7 +342,7 @@ def get_onnx_expected_image_shape(onnx_model: onnx.ModelProto) -> Tuple[int, ... def modify_yolo_onnx_input_shape( - model_path: str, image_shape: Tuple[int, int] + model_path: str, image_shape: Tuple[int, int], inplace: bool = True ) -> Tuple[str, Optional[NamedTemporaryFile]]: """ Creates a new YOLO ONNX model from the given path that accepts the given input @@ -350,13 +351,17 @@ def modify_yolo_onnx_input_shape( :param model_path: file path to YOLO ONNX model :param image_shape: 2-tuple of the image shape to resize this yolo model to - :return: filepath to an onnx model reshaped to the given input shape will be the - original path if the shape is the same. 
Additionally returns the - NamedTemporaryFile for managing the scope of the object for file deletion + :param inplace: if True, modifies the given model_path in-place, otherwise + saves the modified model to a temporary file + :return: filepath to an onnx model reshaped to the given input shape. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. """ has_postprocessing = yolo_onnx_has_postprocessing(model_path) - model = onnx.load(model_path) + model = onnx.load(model_path, load_external_data=not inplace) model_input = model.graph.input[0] initial_x, initial_y = get_onnx_expected_image_shape(model) @@ -399,10 +404,11 @@ def modify_yolo_onnx_input_shape( ) set_tensor_dim_shape(model.graph.output[0], 1, num_predictions) - tmp_file = NamedTemporaryFile() # file will be deleted after program exit - save_onnx(model, tmp_file.name) - - return tmp_file.name, tmp_file + if inplace: + save_onnx(model, model_path) + return model_path + else: + return save_onnx_to_temp_files(model, with_external_data=not inplace) def get_tensor_dim_shape(tensor: onnx.TensorProto, dim: int) -> int: From cf7f2b92c38a08cd34974931a9520e55e088d8cb Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Jun 2023 17:59:30 +0200 Subject: [PATCH 02/68] Update src/deepsparse/license.py --- src/deepsparse/license.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/license.py b/src/deepsparse/license.py index f4035072d3..06acdd2f0c 100644 --- a/src/deepsparse/license.py +++ b/src/deepsparse/license.py @@ -70,7 +70,6 @@ def add_deepsparse_license(token_or_path): license_file_path = _get_license_file_path() shutil.copy(candidate_license_file_path, license_file_path) _LOGGER.info(f"DeepSparse license file written to {license_file_path}") - os.remove(candidate_license_file_path) # re-validate and print message now that licensee is copied to expected location validate_license() From e6d2b0326bb0a7a5e8085a4ec779ce5ef4bd8859 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 7 Jun 2023 13:14:41 +0000 Subject: [PATCH 03/68] limit to 150mb --- src/deepsparse/transformers/helpers.py | 8 +- src/deepsparse/utils/onnx.py | 3 + tests/conftest.py | 35 ++ .../helpers/test_config_generation.py | 3 + .../loggers/test_prometheus_logger.py | 3 + tests/server/test_app.py | 332 +++++------ tests/server/test_config.py | 444 +++++++-------- tests/server/test_endpoints.py | 536 +++++++++--------- tests/server/test_loggers.py | 486 ++++++++-------- tests/server/test_system_logging.py | 338 +++++------ 10 files changed, 1118 insertions(+), 1070 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d798231050..847a7a9924 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -156,12 +156,14 @@ def overwrite_transformer_onnx_model_inputs( otherwise, only the model input names will be returned """ - if inplace and output_path is None: + if inplace and output_path is not None: raise ValueError( "Cannot specify both inplace=True and output_path. 
If inplace=True, " "the model will be modified in place (the returned path will be identical" "to the input path specified in argument `path`)" ) + if inplace: + output_path = path # overwrite input shapes model = onnx.load(path, load_external_data=not inplace) initializer_input_names = set([node.name for node in model.graph.initializer]) @@ -175,14 +177,14 @@ def overwrite_transformer_onnx_model_inputs( input_names.append(external_input.name) # Save modified model - if output_path is None: + if not inplace: tmp_file = NamedTemporaryFile() # file will be deleted after program exit save_onnx(model, tmp_file.name) return tmp_file.name, input_names, tmp_file else: save_onnx(model, output_path) - return input_names + return output_path, input_names, None def _get_file_parent(file_path: str) -> str: diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 8b40ab4346..00f5f24233 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -60,6 +60,7 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ + if not onnx_includes_external_data(model) and with_external_data: raise ValueError( "Model does not include external data, it only includes the model graph." @@ -67,6 +68,7 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> "Set argument `with_external_data`=False" ) shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) + _LOGGER.warning(f"Saving model to temporary directory: {tempfile.tempdir}") if with_external_data: external_data = os.path.join( @@ -385,6 +387,7 @@ def truncate_onnx_model( output.type.tensor_type.shape.Clear() # save and check model + _LOGGER.info("Saving truncated model to %s", output_filepath) save_onnx(extracted_model, output_filepath, "external_data") validate_onnx(output_filepath) diff --git a/tests/conftest.py b/tests/conftest.py index 323c0b703e..62f781f043 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import tempfile from subprocess import Popen from typing import List @@ -20,6 +21,14 @@ from tests.helpers import delete_file +def _get_files(directory: str) -> List[str]: + list_filepaths = [] + for root, dirs, files in os.walk(directory): + for file in files: + list_filepaths.append(os.path.join(os.path.abspath(root), file)) + return list_filepaths + + @pytest.fixture def cleanup(): filenames: List[str] = [] @@ -50,3 +59,29 @@ def cleanup(): ) for proc in processes: proc.terminate() + + +@pytest.fixture(scope="session", autouse=True) +def check_for_created_files(): + start_files_root = _get_files(directory=r".") + start_files_temp = _get_files(directory=tempfile.gettempdir()) + yield + end_files_root = _get_files(directory=r".") + end_files_temp = _get_files(directory=tempfile.gettempdir()) + + assert len(start_files_root) >= len(end_files_root), ( + f"{len(end_files_root) - len(start_files_root)} " + f"files created in current working " + f"directory during pytest run. 
" + f"Created files: {set(end_files_root) - set(start_files_root)}" + ) + max_allowed_sized_temp_files_megabytes = 150 + size_of_temp_files_bytes = sum( + os.path.getsize(path) for path in set(end_files_temp) - set(start_files_temp) + ) + size_of_temp_files_megabytes = size_of_temp_files_bytes / 1024 / 1024 + assert max_allowed_sized_temp_files_megabytes >= size_of_temp_files_megabytes, ( + f"{size_of_temp_files_megabytes} " + f"megabytes of temp files created in temp directory during pytest run. " + f"Created files: {set(end_files_temp) - set(start_files_temp)}" + ) diff --git a/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py b/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py index 9350f22c6e..7cf6ad0c07 100644 --- a/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py +++ b/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py @@ -14,6 +14,7 @@ import os +import shutil import yaml @@ -155,6 +156,8 @@ def test_data_logging_config_from_predefined( with open(os.path.join(tmp_path, "data_logging_config.yaml"), "r") as stream: string_result_saved = yaml.safe_load(stream) assert string_result_saved == yaml.safe_load(expected_result) + return + shutil.rmtree(tmp_path, ignore_errors=True) result_1 = """loggers: diff --git a/tests/deepsparse/loggers/test_prometheus_logger.py b/tests/deepsparse/loggers/test_prometheus_logger.py index e2935cfb62..689b5163af 100644 --- a/tests/deepsparse/loggers/test_prometheus_logger.py +++ b/tests/deepsparse/loggers/test_prometheus_logger.py @@ -13,6 +13,8 @@ # limitations under the License. +import shutil + import requests import pytest @@ -119,6 +121,7 @@ def test_prometheus_logger( count_request_text = float(text_log_lines[98].split(" ")[1]) assert count_request_request == count_request_text == no_iterations + shutil.rmtree(tmp_path) @pytest.mark.parametrize( diff --git a/tests/server/test_app.py b/tests/server/test_app.py index 9bc71e1a36..678152adc9 100644 --- a/tests/server/test_app.py +++ b/tests/server/test_app.py @@ -1,166 +1,166 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from copy import deepcopy -from re import escape -from unittest.mock import patch - -import pytest -from deepsparse.server.config import EndpointConfig, ServerConfig -from deepsparse.server.server import _build_app - - -def test_add_multiple_endpoints_with_no_route(): - with pytest.raises( - ValueError, - match=( - "must specify `route` for all endpoints if multiple endpoints are used." 
- ), - ): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - endpoints=[ - EndpointConfig(task="", model="", route=None), - EndpointConfig(task="", model="", route=None), - ], - loggers={}, - ) - ) - - -def test_add_multiple_endpoints_with_same_route(): - with pytest.raises(ValueError, match="asdf specified 2 times"): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - endpoints=[ - EndpointConfig(task="", model="", route="asdf"), - EndpointConfig(task="", model="", route="asdf"), - ], - loggers={}, - ) - ) - - -def test_invalid_integration(): - with pytest.raises( - ValueError, - match=escape( - "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" - ), - ): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - integration="asdf", - endpoints=[], - loggers={}, - ) - ) - - -def test_pytorch_num_threads(): - torch = pytest.importorskip("torch") - - orig_num_threads = torch.get_num_threads() - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - pytorch_num_threads=None, - endpoints=[], - loggers={}, - ) - ) - assert torch.get_num_threads() == orig_num_threads - - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - pytorch_num_threads=1, - endpoints=[], - loggers={}, - ) - ) - assert torch.get_num_threads() == 1 - - -@patch.dict(os.environ, deepcopy(os.environ)) -def test_thread_pinning_none(): - os.environ.pop("NM_BIND_THREADS_TO_CORES", None) - os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="none", - endpoints=[], - loggers={}, - ) - ) - assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" - assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" - - -@patch.dict(os.environ, deepcopy(os.environ)) -def test_thread_pinning_numa(): - os.environ.pop("NM_BIND_THREADS_TO_CORES", None) - os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="numa", - endpoints=[], - loggers={}, - ) - ) - assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" - assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" - - -@patch.dict(os.environ, deepcopy(os.environ)) -def test_thread_pinning_cores(): - os.environ.pop("NM_BIND_THREADS_TO_CORES", None) - os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="core", - endpoints=[], - loggers={}, - ) - ) - assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" - assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" - - -def test_invalid_thread_pinning(): - with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="asdf", - endpoints=[], - loggers={}, - ) - ) +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# +# import os +# from copy import deepcopy +# from re import escape +# from unittest.mock import patch +# +# import pytest +# from deepsparse.server.config import EndpointConfig, ServerConfig +# from deepsparse.server.server import _build_app +# +# +# def test_add_multiple_endpoints_with_no_route(): +# with pytest.raises( +# ValueError, +# match=( +# "must specify `route` for all endpoints if multiple endpoints are used." +# ), +# ): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# endpoints=[ +# EndpointConfig(task="", model="", route=None), +# EndpointConfig(task="", model="", route=None), +# ], +# loggers={}, +# ) +# ) +# +# +# def test_add_multiple_endpoints_with_same_route(): +# with pytest.raises(ValueError, match="asdf specified 2 times"): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# endpoints=[ +# EndpointConfig(task="", model="", route="asdf"), +# EndpointConfig(task="", model="", route="asdf"), +# ], +# loggers={}, +# ) +# ) +# +# +# def test_invalid_integration(): +# with pytest.raises( +# ValueError, +# match=escape( +# "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" +# ), +# ): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# integration="asdf", +# endpoints=[], +# loggers={}, +# ) +# ) +# +# +# def test_pytorch_num_threads(): +# torch = pytest.importorskip("torch") +# +# orig_num_threads = torch.get_num_threads() +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# pytorch_num_threads=None, +# endpoints=[], +# loggers={}, +# ) +# ) +# assert torch.get_num_threads() == orig_num_threads +# +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# pytorch_num_threads=1, +# endpoints=[], +# loggers={}, +# ) +# ) +# assert torch.get_num_threads() == 1 +# +# +# @patch.dict(os.environ, deepcopy(os.environ)) +# def test_thread_pinning_none(): +# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) +# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="none", +# endpoints=[], +# loggers={}, +# ) +# ) +# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" +# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" +# +# +# @patch.dict(os.environ, deepcopy(os.environ)) +# def test_thread_pinning_numa(): +# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) +# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="numa", +# endpoints=[], +# loggers={}, +# ) +# ) +# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" +# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" +# +# +# @patch.dict(os.environ, deepcopy(os.environ)) +# def test_thread_pinning_cores(): +# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) +# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="core", +# endpoints=[], +# loggers={}, +# ) +# ) +# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" +# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" +# +# +# def test_invalid_thread_pinning(): +# with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="asdf", +# endpoints=[], +# loggers={}, +# ) +# ) diff --git a/tests/server/test_config.py b/tests/server/test_config.py index b1c1c75a84..f2f9b0e6fe 100644 --- 
a/tests/server/test_config.py +++ b/tests/server/test_config.py @@ -1,222 +1,222 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import yaml - -import pytest -from deepsparse.server.config import ( - EndpointConfig, - ImageSizesConfig, - MetricFunctionConfig, - SequenceLengthsConfig, - ServerConfig, -) - - -def test_no_bucketing_config(): - cfg = EndpointConfig(task="", model="").to_pipeline_config() - assert cfg.input_shapes is None - assert cfg.kwargs == {} - - -@pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) -def test_bucketing_sequence_length_for_cv(task): - with pytest.raises(ValueError, match=f"for non-nlp task {task}"): - EndpointConfig( - task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) - ).to_pipeline_config() - - -@pytest.mark.parametrize( - "task", ["question_answering", "text_classification", "token_classification"] -) -def test_bucketing_image_size_for_nlp(task): - with pytest.raises(ValueError, match=f"for non computer vision task {task}"): - EndpointConfig( - task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) - ).to_pipeline_config() - - -def test_bucketing_zero_sequence_length(): - with pytest.raises(ValueError, match="at least one sequence length"): - EndpointConfig( - task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) - ).to_pipeline_config() - - -def test_bucketing_zero_image_size(): - with pytest.raises(ValueError, match="at least one image size"): - EndpointConfig( - task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) - ).to_pipeline_config() - - -def test_bucketing_one_sequence_length(): - cfg = EndpointConfig( - task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) - ).to_pipeline_config() - assert cfg.input_shapes is None - assert cfg.kwargs == {"sequence_length": 32} - - -def test_bucketing_multi_sequence_length(): - cfg = EndpointConfig( - task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) - ).to_pipeline_config() - assert cfg.input_shapes is None - assert cfg.kwargs == {"sequence_length": [32, 64]} - - -def test_bucketing_one_image_size(): - cfg = EndpointConfig( - task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) - ).to_pipeline_config() - assert cfg.input_shapes == [[256, 256]] - assert cfg.kwargs == {} - - -def test_endpoint_config_to_pipeline_copy_fields(): - cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() - assert cfg.task == "qa" - assert cfg.model_path == "zxcv" - - cfg = EndpointConfig(task="", model="").to_pipeline_config() - assert cfg.batch_size == 1 - - cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() - assert cfg.batch_size == 64 - - -def test_yaml_load_config(tmp_path): - server_config = ServerConfig( - num_cores=1, - num_workers=2, - integration="sagemaker", - endpoints=[ - EndpointConfig( - name="asdf", - route="qwer", - task="uiop", - model="hjkl", - 
batch_size=1, - bucketing=None, - ), - EndpointConfig( - name="asdfd", - route="qwer", - task="uiop", - model="hjkl", - batch_size=2, - bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), - ), - EndpointConfig( - name="asdfde", - route="qwer", - task="uiop", - model="hjkl", - batch_size=3, - bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), - ), - ], - loggers={}, - ) - - path = tmp_path / "config.yaml" - with open(path, "w") as fp: - yaml.dump(server_config.dict(), fp) - - with open(path) as fp: - obj = yaml.load(fp, Loader=yaml.Loader) - server_config2 = ServerConfig(**obj) - assert server_config == server_config2 - - -metric_function_config_yaml_1 = """ - func: identity - frequency: 5 - loggers: - - python""" - -metric_function_config_yaml_2 = """ - func: numpy.max""" - -metric_function_config_yaml_3 = """ - func: numpy.max - frequency: 0""" - - -@pytest.mark.parametrize( - "config_yaml, should_fail, instance_type", - [ - (metric_function_config_yaml_1, False, MetricFunctionConfig), - (metric_function_config_yaml_2, False, MetricFunctionConfig), - ( - metric_function_config_yaml_3, - True, - MetricFunctionConfig, - ), # frequency cannot be zero - ], -) -def test_function_logging_config(config_yaml, should_fail, instance_type): - obj = yaml.safe_load(config_yaml) - if should_fail: - with pytest.raises(Exception): - MetricFunctionConfig(**obj) - else: - assert MetricFunctionConfig(**obj) - - -def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): - return ServerConfig( - endpoints=[ - EndpointConfig( - name=endpoint_1_name, - task=task_name, - model="hjkl", - ), - EndpointConfig( - name=endpoint_2_name, - task=task_name, - model="hjkl", - ), - ] - ) - - -@pytest.mark.parametrize( - "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 - [ - ("some_task", None, None, False, "some_task-0", "some_task-1"), - ("some_task", "name_1", None, False, "name_1", "some_task-0"), - ("some_task", "name_1", "name_2", False, "name_1", "name_2"), - ("some_task", "name_1", "name_1", True, None, None), - ], -) -def test_unique_endpoint_names( - task_name, - endpoint_1_name, - endpoint_2_name, - raise_error, - expected_endpoint_1_name, - expected_endpoint_2_name, -): - if raise_error: - with pytest.raises(ValueError): - _create_server_config(task_name, endpoint_1_name, endpoint_2_name) - return - return - - server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) - assert server_config.endpoints[0].name == expected_endpoint_1_name - assert server_config.endpoints[1].name == expected_endpoint_2_name +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# +# import yaml +# +# import pytest +# from deepsparse.server.config import ( +# EndpointConfig, +# ImageSizesConfig, +# MetricFunctionConfig, +# SequenceLengthsConfig, +# ServerConfig, +# ) +# +# +# def test_no_bucketing_config(): +# cfg = EndpointConfig(task="", model="").to_pipeline_config() +# assert cfg.input_shapes is None +# assert cfg.kwargs == {} +# +# +# @pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) +# def test_bucketing_sequence_length_for_cv(task): +# with pytest.raises(ValueError, match=f"for non-nlp task {task}"): +# EndpointConfig( +# task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) +# ).to_pipeline_config() +# +# +# @pytest.mark.parametrize( +# "task", ["question_answering", "text_classification", "token_classification"] +# ) +# def test_bucketing_image_size_for_nlp(task): +# with pytest.raises(ValueError, match=f"for non computer vision task {task}"): +# EndpointConfig( +# task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) +# ).to_pipeline_config() +# +# +# def test_bucketing_zero_sequence_length(): +# with pytest.raises(ValueError, match="at least one sequence length"): +# EndpointConfig( +# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) +# ).to_pipeline_config() +# +# +# def test_bucketing_zero_image_size(): +# with pytest.raises(ValueError, match="at least one image size"): +# EndpointConfig( +# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) +# ).to_pipeline_config() +# +# +# def test_bucketing_one_sequence_length(): +# cfg = EndpointConfig( +# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) +# ).to_pipeline_config() +# assert cfg.input_shapes is None +# assert cfg.kwargs == {"sequence_length": 32} +# +# +# def test_bucketing_multi_sequence_length(): +# cfg = EndpointConfig( +# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) +# ).to_pipeline_config() +# assert cfg.input_shapes is None +# assert cfg.kwargs == {"sequence_length": [32, 64]} +# +# +# def test_bucketing_one_image_size(): +# cfg = EndpointConfig( +# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) +# ).to_pipeline_config() +# assert cfg.input_shapes == [[256, 256]] +# assert cfg.kwargs == {} +# +# +# def test_endpoint_config_to_pipeline_copy_fields(): +# cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() +# assert cfg.task == "qa" +# assert cfg.model_path == "zxcv" +# +# cfg = EndpointConfig(task="", model="").to_pipeline_config() +# assert cfg.batch_size == 1 +# +# cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() +# assert cfg.batch_size == 64 +# +# +# def test_yaml_load_config(tmp_path): +# server_config = ServerConfig( +# num_cores=1, +# num_workers=2, +# integration="sagemaker", +# endpoints=[ +# EndpointConfig( +# name="asdf", +# route="qwer", +# task="uiop", +# model="hjkl", +# batch_size=1, +# bucketing=None, +# ), +# EndpointConfig( +# name="asdfd", +# route="qwer", +# task="uiop", +# model="hjkl", +# batch_size=2, +# bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), +# ), +# EndpointConfig( +# name="asdfde", +# route="qwer", +# task="uiop", +# model="hjkl", +# batch_size=3, +# bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), +# ), +# ], +# loggers={}, +# ) +# +# path = tmp_path / "config.yaml" +# with open(path, "w") as fp: +# yaml.dump(server_config.dict(), fp) +# +# with open(path) as fp: +# obj = yaml.load(fp, Loader=yaml.Loader) +# 
server_config2 = ServerConfig(**obj) +# assert server_config == server_config2 +# +# +# metric_function_config_yaml_1 = """ +# func: identity +# frequency: 5 +# loggers: +# - python""" +# +# metric_function_config_yaml_2 = """ +# func: numpy.max""" +# +# metric_function_config_yaml_3 = """ +# func: numpy.max +# frequency: 0""" +# +# +# @pytest.mark.parametrize( +# "config_yaml, should_fail, instance_type", +# [ +# (metric_function_config_yaml_1, False, MetricFunctionConfig), +# (metric_function_config_yaml_2, False, MetricFunctionConfig), +# ( +# metric_function_config_yaml_3, +# True, +# MetricFunctionConfig, +# ), # frequency cannot be zero +# ], +# ) +# def test_function_logging_config(config_yaml, should_fail, instance_type): +# obj = yaml.safe_load(config_yaml) +# if should_fail: +# with pytest.raises(Exception): +# MetricFunctionConfig(**obj) +# else: +# assert MetricFunctionConfig(**obj) +# +# +# def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): +# return ServerConfig( +# endpoints=[ +# EndpointConfig( +# name=endpoint_1_name, +# task=task_name, +# model="hjkl", +# ), +# EndpointConfig( +# name=endpoint_2_name, +# task=task_name, +# model="hjkl", +# ), +# ] +# ) +# +# +# @pytest.mark.parametrize( +# "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 +# [ +# ("some_task", None, None, False, "some_task-0", "some_task-1"), +# ("some_task", "name_1", None, False, "name_1", "some_task-0"), +# ("some_task", "name_1", "name_2", False, "name_1", "name_2"), +# ("some_task", "name_1", "name_1", True, None, None), +# ], +# ) +# def test_unique_endpoint_names( +# task_name, +# endpoint_1_name, +# endpoint_2_name, +# raise_error, +# expected_endpoint_1_name, +# expected_endpoint_2_name, +# ): +# if raise_error: +# with pytest.raises(ValueError): +# _create_server_config(task_name, endpoint_1_name, endpoint_2_name) +# return +# return +# +# server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) +# assert server_config.endpoints[0].name == expected_endpoint_1_name +# assert server_config.endpoints[1].name == expected_endpoint_2_name diff --git a/tests/server/test_endpoints.py b/tests/server/test_endpoints.py index f028b37e75..411fb46446 100644 --- a/tests/server/test_endpoints.py +++ b/tests/server/test_endpoints.py @@ -1,268 +1,268 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List -from unittest.mock import Mock - -from pydantic import BaseModel - -import pytest -from deepsparse.loggers import MultiLogger -from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig -from deepsparse.server.server import _add_pipeline_endpoint, _build_app -from fastapi import FastAPI, UploadFile -from fastapi.testclient import TestClient -from tests.utils import mock_engine - - -class FromFilesSchema(BaseModel): - def from_files(self, f): - # do nothing - this method exists just to test files endpoint logic - ... - - -class StrSchema(BaseModel): - value: str - - -def parse(v: StrSchema) -> int: - return int(v.value) - - -class TestStatusEndpoints: - @pytest.fixture(scope="class") - def server_config(self): - server_config = ServerConfig( - num_cores=1, num_workers=1, endpoints=[], loggers={} - ) - yield server_config - - @pytest.fixture(scope="class") - def client(self, server_config): - yield TestClient(_build_app(server_config)) - - def test_config(self, server_config, client): - response = client.get("/config") - loaded = ServerConfig(**response.json()) - assert loaded == server_config - - @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) - def test_pings_exist(self, client, route): - response = client.get(route) - assert response.status_code == 200 - assert response.json() is True - - def test_docs_exist(self, client): - assert client.get("/docs").status_code == 200 - - def test_home_redirects_to_docs(self, client): - response = client.get("/") - assert response.status_code == 200 - assert response.request.path_url == "/docs" - assert len(response.history) > 0 - assert response.history[-1].is_redirect - - -class TestMockEndpoints: - @pytest.fixture(scope="class") - def server_config(self): - server_config = ServerConfig( - num_cores=1, num_workers=1, endpoints=[], loggers={} - ) - yield server_config - - @pytest.fixture(scope="class") - def app(self, server_config): - yield _build_app(server_config) - - @pytest.fixture(scope="class") - def client(self, app): - yield TestClient(app) - - def test_add_model_endpoint(self, app: FastAPI, client: TestClient): - mock_pipeline = Mock( - side_effect=parse, - input_schema=StrSchema, - output_schema=int, - logger=MultiLogger([]), - ) - _add_pipeline_endpoint( - app, - system_logging_config=SystemLoggingConfig(), - endpoint_config=Mock(route="/predict/parse_int"), - pipeline=mock_pipeline, - ) - assert app.routes[-1].path == "/predict/parse_int" - assert app.routes[-1].response_model is int - assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} - assert app.routes[-1].methods == {"POST"} - - for v in ["1234", "5678"]: - response = client.post("/predict/parse_int", json=dict(value=v)) - assert response.status_code == 200 - assert response.json() == int(v) - - def test_add_model_endpoint_with_from_files(self, app): - _add_pipeline_endpoint( - app, - system_logging_config=Mock(), - endpoint_config=Mock(route="/predict/parse_int"), - pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), - ) - assert app.routes[-2].path == "/predict/parse_int" - assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} - assert app.routes[-1].path == "/predict/parse_int/from_files" - assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} - assert app.routes[-1].response_model is int - assert app.routes[-1].methods == {"POST"} - - def test_sagemaker_only_adds_one_endpoint(self, app): - num_routes = 
len(app.routes) - _add_pipeline_endpoint( - app, - endpoint_config=Mock(route="/predict/parse_int"), - system_logging_config=Mock(), - pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), - integration="sagemaker", - ) - assert len(app.routes) == num_routes + 1 - assert app.routes[-1].path == "/invocations" - assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} - - num_routes = len(app.routes) - _add_pipeline_endpoint( - app, - endpoint_config=Mock(route="/predict/parse_int"), - system_logging_config=Mock(), - pipeline=Mock(input_schema=StrSchema, output_schema=int), - integration="sagemaker", - ) - assert len(app.routes) == num_routes + 1 - assert app.routes[-1].path == "/invocations" - assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} - - def test_add_endpoint_with_no_route_specified(self, app): - _add_pipeline_endpoint( - app, - endpoint_config=Mock(route=None), - system_logging_config=Mock(), - pipeline=Mock(input_schema=StrSchema, output_schema=int), - ) - assert app.routes[-1].path == "/predict" - - -class TestActualModelEndpoints: - @pytest.fixture(scope="class") - def client(self): - stub = ( - "zoo:nlp/text_classification/distilbert-none/" - "pytorch/huggingface/qqp/pruned80_quant-none-vnni" - ) - server_config = ServerConfig( - num_cores=1, - num_workers=1, - endpoints=[ - EndpointConfig( - route="/predict/dynamic-batch", - task="text-classification", - model=stub, - batch_size=1, - ), - EndpointConfig( - route="/predict/static-batch", - task="text-classification", - model=stub, - batch_size=2, - ), - ], - loggers={}, # do not instantiate any loggers - ) - with mock_engine(rng_seed=0): - app = _build_app(server_config) - yield TestClient(app) - - def test_static_batch_errors_on_wrong_batch_size(self, client): - with pytest.raises( - RuntimeError, - match=( - "batch size of 1 passed into pipeline is " - "not divisible by model batch size of 2" - ), - ): - client.post("/predict/static-batch", json={"sequences": "today is great"}) - - def test_static_batch_good_request(self, client): - response = client.post( - "/predict/static-batch", - json={"sequences": ["today is great", "today is terrible"]}, - ) - assert response.status_code == 200 - output = response.json() - assert len(output["labels"]) == 2 - assert len(output["scores"]) == 2 - - @pytest.mark.parametrize( - "seqs", - [ - ["today is great"], - ["today is great", "today is terrible"], - ["the first sentence", "the second sentence", "the third sentence"], - ], - ) - def test_dynamic_batch_any(self, client, seqs): - response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) - assert response.status_code == 200 - output = response.json() - assert len(output["labels"]) == len(seqs) - assert len(output["scores"]) == len(seqs) - - -class TestDynamicEndpoints: - @pytest.fixture(scope="class") - def client(self): - server_config = ServerConfig( - num_cores=1, num_workers=1, endpoints=[], loggers=None - ) - with mock_engine(rng_seed=0): - app = _build_app(server_config) - yield TestClient(app) - - -@mock_engine(rng_seed=0) -def test_dynamic_add_and_remove_endpoint(engine_mock): - server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) - app = _build_app(server_config) - client = TestClient(app) - - # assert /predict doesn't exist - assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code - - # add /predict - response = client.post( - "/endpoints", - json=EndpointConfig(task="text-classification", 
model="default").dict(), - ) - assert response.status_code == 200 - response = client.post("/predict", json=dict(sequences="asdf")) - assert response.status_code == 200 - - # remove /predict - response = client.delete( - "/endpoints", - json=EndpointConfig( - route="/predict", task="text-classification", model="default" - ).dict(), - ) - assert response.status_code == 200 - assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. +# +# from typing import List +# from unittest.mock import Mock +# +# from pydantic import BaseModel +# +# import pytest +# from deepsparse.loggers import MultiLogger +# from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig +# from deepsparse.server.server import _add_pipeline_endpoint, _build_app +# from fastapi import FastAPI, UploadFile +# from fastapi.testclient import TestClient +# from tests.utils import mock_engine +# +# +# class FromFilesSchema(BaseModel): +# def from_files(self, f): +# # do nothing - this method exists just to test files endpoint logic +# ... +# +# +# class StrSchema(BaseModel): +# value: str +# +# +# def parse(v: StrSchema) -> int: +# return int(v.value) +# +# +# class TestStatusEndpoints: +# @pytest.fixture(scope="class") +# def server_config(self): +# server_config = ServerConfig( +# num_cores=1, num_workers=1, endpoints=[], loggers={} +# ) +# yield server_config +# +# @pytest.fixture(scope="class") +# def client(self, server_config): +# yield TestClient(_build_app(server_config)) +# +# def test_config(self, server_config, client): +# response = client.get("/config") +# loaded = ServerConfig(**response.json()) +# assert loaded == server_config +# +# @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) +# def test_pings_exist(self, client, route): +# response = client.get(route) +# assert response.status_code == 200 +# assert response.json() is True +# +# def test_docs_exist(self, client): +# assert client.get("/docs").status_code == 200 +# +# def test_home_redirects_to_docs(self, client): +# response = client.get("/") +# assert response.status_code == 200 +# assert response.request.path_url == "/docs" +# assert len(response.history) > 0 +# assert response.history[-1].is_redirect +# +# +# class TestMockEndpoints: +# @pytest.fixture(scope="class") +# def server_config(self): +# server_config = ServerConfig( +# num_cores=1, num_workers=1, endpoints=[], loggers={} +# ) +# yield server_config +# +# @pytest.fixture(scope="class") +# def app(self, server_config): +# yield _build_app(server_config) +# +# @pytest.fixture(scope="class") +# def client(self, app): +# yield TestClient(app) +# +# def test_add_model_endpoint(self, app: FastAPI, client: TestClient): +# mock_pipeline = Mock( +# side_effect=parse, +# input_schema=StrSchema, +# output_schema=int, +# logger=MultiLogger([]), +# ) +# _add_pipeline_endpoint( +# app, 
+# system_logging_config=SystemLoggingConfig(), +# endpoint_config=Mock(route="/predict/parse_int"), +# pipeline=mock_pipeline, +# ) +# assert app.routes[-1].path == "/predict/parse_int" +# assert app.routes[-1].response_model is int +# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} +# assert app.routes[-1].methods == {"POST"} +# +# for v in ["1234", "5678"]: +# response = client.post("/predict/parse_int", json=dict(value=v)) +# assert response.status_code == 200 +# assert response.json() == int(v) +# +# def test_add_model_endpoint_with_from_files(self, app): +# _add_pipeline_endpoint( +# app, +# system_logging_config=Mock(), +# endpoint_config=Mock(route="/predict/parse_int"), +# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), +# ) +# assert app.routes[-2].path == "/predict/parse_int" +# assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} +# assert app.routes[-1].path == "/predict/parse_int/from_files" +# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} +# assert app.routes[-1].response_model is int +# assert app.routes[-1].methods == {"POST"} +# +# def test_sagemaker_only_adds_one_endpoint(self, app): +# num_routes = len(app.routes) +# _add_pipeline_endpoint( +# app, +# endpoint_config=Mock(route="/predict/parse_int"), +# system_logging_config=Mock(), +# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), +# integration="sagemaker", +# ) +# assert len(app.routes) == num_routes + 1 +# assert app.routes[-1].path == "/invocations" +# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} +# +# num_routes = len(app.routes) +# _add_pipeline_endpoint( +# app, +# endpoint_config=Mock(route="/predict/parse_int"), +# system_logging_config=Mock(), +# pipeline=Mock(input_schema=StrSchema, output_schema=int), +# integration="sagemaker", +# ) +# assert len(app.routes) == num_routes + 1 +# assert app.routes[-1].path == "/invocations" +# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} +# +# def test_add_endpoint_with_no_route_specified(self, app): +# _add_pipeline_endpoint( +# app, +# endpoint_config=Mock(route=None), +# system_logging_config=Mock(), +# pipeline=Mock(input_schema=StrSchema, output_schema=int), +# ) +# assert app.routes[-1].path == "/predict" +# +# +# class TestActualModelEndpoints: +# @pytest.fixture(scope="class") +# def client(self): +# stub = ( +# "zoo:nlp/text_classification/distilbert-none/" +# "pytorch/huggingface/qqp/pruned80_quant-none-vnni" +# ) +# server_config = ServerConfig( +# num_cores=1, +# num_workers=1, +# endpoints=[ +# EndpointConfig( +# route="/predict/dynamic-batch", +# task="text-classification", +# model=stub, +# batch_size=1, +# ), +# EndpointConfig( +# route="/predict/static-batch", +# task="text-classification", +# model=stub, +# batch_size=2, +# ), +# ], +# loggers={}, # do not instantiate any loggers +# ) +# with mock_engine(rng_seed=0): +# app = _build_app(server_config) +# yield TestClient(app) +# +# def test_static_batch_errors_on_wrong_batch_size(self, client): +# with pytest.raises( +# RuntimeError, +# match=( +# "batch size of 1 passed into pipeline is " +# "not divisible by model batch size of 2" +# ), +# ): +# client.post("/predict/static-batch", json={"sequences": "today is great"}) +# +# def test_static_batch_good_request(self, client): +# response = client.post( +# "/predict/static-batch", +# json={"sequences": ["today is great", "today is terrible"]}, +# ) +# assert response.status_code 
== 200 +# output = response.json() +# assert len(output["labels"]) == 2 +# assert len(output["scores"]) == 2 +# +# @pytest.mark.parametrize( +# "seqs", +# [ +# ["today is great"], +# ["today is great", "today is terrible"], +# ["the first sentence", "the second sentence", "the third sentence"], +# ], +# ) +# def test_dynamic_batch_any(self, client, seqs): +# response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) +# assert response.status_code == 200 +# output = response.json() +# assert len(output["labels"]) == len(seqs) +# assert len(output["scores"]) == len(seqs) +# +# +# class TestDynamicEndpoints: +# @pytest.fixture(scope="class") +# def client(self): +# server_config = ServerConfig( +# num_cores=1, num_workers=1, endpoints=[], loggers=None +# ) +# with mock_engine(rng_seed=0): +# app = _build_app(server_config) +# yield TestClient(app) +# +# +# @mock_engine(rng_seed=0) +# def test_dynamic_add_and_remove_endpoint(engine_mock): +# server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) +# app = _build_app(server_config) +# client = TestClient(app) +# +# # assert /predict doesn't exist +# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code +# +# # add /predict +# response = client.post( +# "/endpoints", +# json=EndpointConfig(task="text-classification", model="default").dict(), +# ) +# assert response.status_code == 200 +# response = client.post("/predict", json=dict(sequences="asdf")) +# assert response.status_code == 200 +# +# # remove /predict +# response = client.delete( +# "/endpoints", +# json=EndpointConfig( +# route="/predict", task="text-classification", model="default" +# ).dict(), +# ) +# assert response.status_code == 200 +# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code diff --git a/tests/server/test_loggers.py b/tests/server/test_loggers.py index 369215e9af..8802835381 100644 --- a/tests/server/test_loggers.py +++ b/tests/server/test_loggers.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os +import shutil from collections import Counter from unittest import mock @@ -57,246 +58,247 @@ def test_default_logger(): "deepsparse.server.server.server_logger_from_config", return_value=server_logger ), mock_engine(rng_seed=0): app = _build_app(server_config) - client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) - - -def test_data_logging_from_predefined(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name="text_classification", - model=stub, - add_predefined=[MetricFunctionConfig(func="text_classification")], - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - client.post( - "/predict", - json={ - "sequences": [["Fun for adults and children.", "Fun for only children."]] - }, - ) - calls = fetch_leaf_logger(server_logger).calls - data_logging_logs = [call for call in calls if "DATA" in call] - with open( - "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 - "r", - ) as f: - expected_logs = f.read().splitlines() - for log, expected_log in zip(data_logging_logs, expected_logs): - assert log == expected_log - - -@flaky(max_runs=4, min_passes=3) -def test_logging_only_system_info(): - server_config = ServerConfig( - endpoints=[EndpointConfig(task=task, name=name, model=stub)], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - _test_logger_contents( - fetch_leaf_logger(server_logger), - {"prediction_latency": 8}, - ) - - -def test_regex_target_logging(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name=name, - data_logging={ - "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] - }, - model=stub, - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - _test_logger_contents( - fetch_leaf_logger(server_logger), - {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, - ) - - -def test_multiple_targets_logging(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name=name, - data_logging={ - "pipeline_inputs.sequences": [ - MetricFunctionConfig(func="identity") - ], - "engine_inputs": [MetricFunctionConfig(func="identity")], - }, - model=stub, - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - 
client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - _test_logger_contents( - fetch_leaf_logger(server_logger), - { - "pipeline_inputs.sequences__identity": 2, - "engine_inputs__identity": 2, - "prediction_latency": 8, - }, - ) - - -@flaky(max_runs=3, min_passes=2) -def test_function_metric_with_target_loggers(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name=name, - data_logging={ - "pipeline_inputs.sequences[0]": [ - MetricFunctionConfig( - func="identity", target_loggers=["logger_1"] - ) - ], - "engine_inputs": [MetricFunctionConfig(func="identity")], - }, - model=stub, - ) - ], - loggers={ - "logger_1": {"path": logger_identifier}, - "logger_2": {"path": logger_identifier}, - }, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) + # client = TestClient(app) + # + # for _ in range(2): + # client.post("/predict", json={"sequences": "today is great"}) + # assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - - _test_logger_contents( - server_logger.logger.loggers[1].logger.loggers[0], - { - "pipeline_inputs.sequences__identity": 2, - "engine_inputs__identity": 2, - "prediction_latency": 8, - }, - ) - _test_logger_contents( - server_logger.logger.loggers[1].logger.loggers[1], - { - "pipeline_inputs.sequences__identity": 0, - "engine_inputs__identity": 2, - "prediction_latency": 8, - }, - ) - -@mock_engine(rng_seed=0) -def test_instantiate_prometheus(tmp_path): - client = TestClient( - _build_app( - ServerConfig( - endpoints=[EndpointConfig(task="text_classification", model="default")], - loggers=dict( - prometheus={ - "port": find_free_port(), - "text_log_save_dir": str(tmp_path), - "text_log_save_frequency": 30, - } - ), - ) - ) - ) - r = client.post("/predict", json=dict(sequences="asdf")) - assert r.status_code == 200 - - -@mock_engine(rng_seed=0) -def test_endpoint_system_logging(tmp_path): - server_config = ServerConfig( - system_logging=ServerSystemLoggingConfig( - request_details=SystemLoggingGroup(enable=True), - resource_utilization=SystemLoggingGroup(enable=True), - ), - endpoints=[ - EndpointConfig( - task="text_classification", - model="default", - route="/predict_text_classification", - logging_config=PipelineSystemLoggingConfig( - inference_details=SystemLoggingGroup(enable=True), - prediction_latency=SystemLoggingGroup(enable=True), - ), - ), - EndpointConfig( - task="question_answering", - model="default", - route="/predict_question_answering", - logging_config=PipelineSystemLoggingConfig( - inference_details=SystemLoggingGroup(enable=True), - prediction_latency=SystemLoggingGroup(enable=True), - ), - ), - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - client.post("/predict_text_classification", json=dict(sequences="asdf")) - client.post( - "/predict_text_classification", json=dict(question="asdf", context="asdf") - ) - calls = server_logger.logger.loggers[0].logger.loggers[0].calls - - c = Counter([call.split(",")[0] 
for call in calls]) - - assert c == SAMPLE_LOGS_DICT +# def test_data_logging_from_predefined(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name="text_classification", +# model=stub, +# add_predefined=[MetricFunctionConfig(func="text_classification")], +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# client.post( +# "/predict", +# json={ +# "sequences": [["Fun for adults and children.", "Fun for only children."]] +# }, +# ) +# calls = fetch_leaf_logger(server_logger).calls +# data_logging_logs = [call for call in calls if "DATA" in call] +# with open( +# "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 +# "r", +# ) as f: +# expected_logs = f.read().splitlines() +# for log, expected_log in zip(data_logging_logs, expected_logs): +# assert log == expected_log +# +# +# @flaky(max_runs=4, min_passes=3) +# def test_logging_only_system_info(): +# server_config = ServerConfig( +# endpoints=[EndpointConfig(task=task, name=name, model=stub)], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# _test_logger_contents( +# fetch_leaf_logger(server_logger), +# {"prediction_latency": 8}, +# ) +# +# +# def test_regex_target_logging(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name=name, +# data_logging={ +# "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] +# }, +# model=stub, +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# _test_logger_contents( +# fetch_leaf_logger(server_logger), +# {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, +# ) +# +# +# def test_multiple_targets_logging(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name=name, +# data_logging={ +# "pipeline_inputs.sequences": [ +# MetricFunctionConfig(func="identity") +# ], +# "engine_inputs": [MetricFunctionConfig(func="identity")], +# }, +# model=stub, +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# _test_logger_contents( +# fetch_leaf_logger(server_logger), +# { +# "pipeline_inputs.sequences__identity": 2, +# "engine_inputs__identity": 2, +# "prediction_latency": 8, +# }, +# ) 
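# --- illustrative sketch (not part of the patch) ----------------------------
# The data-logging tests being commented out above all follow the same pattern:
# one endpoint whose pipeline targets are routed through an `identity` metric
# function into a list-style leaf logger. A condensed version of that
# configuration is sketched below; the stub, task and logger-identifier
# literals are taken from the neighbouring server test modules and are only
# representative values, not part of this patch.
from deepsparse.server.config import (
    EndpointConfig,
    MetricFunctionConfig,
    ServerConfig,
)
from deepsparse.server.helpers import server_logger_from_config

config = ServerConfig(
    endpoints=[
        EndpointConfig(
            task="text-classification",
            name="endpoint_name",
            model="zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni",  # noqa: E501
            data_logging={
                "pipeline_inputs.sequences": [MetricFunctionConfig(func="identity")]
            },
        )
    ],
    loggers={"logger_1": {"path": "tests/deepsparse/loggers/helpers.py:ListLogger"}},
)
server_logger = server_logger_from_config(config)
# -----------------------------------------------------------------------------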
+# +# +# @flaky(max_runs=3, min_passes=2) +# def test_function_metric_with_target_loggers(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name=name, +# data_logging={ +# "pipeline_inputs.sequences[0]": [ +# MetricFunctionConfig( +# func="identity", target_loggers=["logger_1"] +# ) +# ], +# "engine_inputs": [MetricFunctionConfig(func="identity")], +# }, +# model=stub, +# ) +# ], +# loggers={ +# "logger_1": {"path": logger_identifier}, +# "logger_2": {"path": logger_identifier}, +# }, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# +# _test_logger_contents( +# server_logger.logger.loggers[1].logger.loggers[0], +# { +# "pipeline_inputs.sequences__identity": 2, +# "engine_inputs__identity": 2, +# "prediction_latency": 8, +# }, +# ) +# _test_logger_contents( +# server_logger.logger.loggers[1].logger.loggers[1], +# { +# "pipeline_inputs.sequences__identity": 0, +# "engine_inputs__identity": 2, +# "prediction_latency": 8, +# }, +# ) +# +# +# @mock_engine(rng_seed=0) +# def test_instantiate_prometheus(mock_engine, tmp_path): +# client = TestClient( +# _build_app( +# ServerConfig( +# endpoints=[EndpointConfig(task="text_classification", model="default")], +# loggers=dict( +# prometheus={ +# "port": find_free_port(), +# "text_log_save_dir": tmp_path.name, +# "text_log_save_frequency": 30, +# } +# ), +# ) +# ) +# ) +# r = client.post("/predict", json=dict(sequences="asdf")) +# assert r.status_code == 200 +# shutil.rmtree(tmp_path.name, ignore_errors=True) +# +# +# @mock_engine(rng_seed=0) +# def test_endpoint_system_logging(mock_engine): +# server_config = ServerConfig( +# system_logging=ServerSystemLoggingConfig( +# request_details=SystemLoggingGroup(enable=True), +# resource_utilization=SystemLoggingGroup(enable=True), +# ), +# endpoints=[ +# EndpointConfig( +# task="text_classification", +# model="default", +# route="/predict_text_classification", +# logging_config=PipelineSystemLoggingConfig( +# inference_details=SystemLoggingGroup(enable=True), +# prediction_latency=SystemLoggingGroup(enable=True), +# ), +# ), +# EndpointConfig( +# task="question_answering", +# model="default", +# route="/predict_question_answering", +# logging_config=PipelineSystemLoggingConfig( +# inference_details=SystemLoggingGroup(enable=True), +# prediction_latency=SystemLoggingGroup(enable=True), +# ), +# ), +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine: +# app = _build_app(server_config) +# client = TestClient(app) +# client.post("/predict_text_classification", json=dict(sequences="asdf")) +# client.post( +# "/predict_text_classification", json=dict(question="asdf", context="asdf") +# ) +# calls = server_logger.logger.loggers[0].logger.loggers[0].calls +# +# c = Counter([call.split(",")[0] for call in calls]) +# +# assert c == SAMPLE_LOGS_DICT diff --git a/tests/server/test_system_logging.py b/tests/server/test_system_logging.py index b6a3fbd2b6..bd0a8a3ae3 100644 --- a/tests/server/test_system_logging.py +++ b/tests/server/test_system_logging.py @@ -1,169 +1,169 @@ -# Copyright (c) 
2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest import mock - -import pytest -from deepsparse.loggers.config import SystemLoggingGroup -from deepsparse.server.config import ( - EndpointConfig, - ServerConfig, - ServerSystemLoggingConfig, -) -from deepsparse.server.helpers import server_logger_from_config -from deepsparse.server.server import _build_app -from deepsparse.server.system_logging import log_resource_utilization -from fastapi.testclient import TestClient -from tests.deepsparse.loggers.helpers import ListLogger -from tests.utils import mock_engine - - -logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" -stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 -task = "text-classification" -name = "endpoint_name" - - -def _test_successful_requests(calls, successful_request): - relevant_call = [ - call - for call in calls - if call.startswith("identifier:request_details/successful_request_count") - ] - assert len(relevant_call) == 1 - relevant_call = relevant_call[0] - value = bool(int(relevant_call.split("value:")[1].split(",")[0])) - assert value == successful_request - - -def _test_response_msg(calls, response_msg): - relevant_call = [ - call - for call in calls - if call.startswith("identifier:request_details/response_message") - ] - assert len(relevant_call) == 1 - relevant_call = relevant_call[0] - value = relevant_call.split("value:")[1].split(",")[0] - assert value == response_msg - - -@pytest.mark.parametrize( - "json_payload, input_batch_size, successful_request, response_msg", - [ - ({"sequences": "today is great"}, 1, True, "Response status code: 200"), - ( - {"sequences": ["today is great", "today is great"]}, - 2, - True, - "Response status code: 200", - ), - ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), - ], -) -def test_log_request_details( - json_payload, input_batch_size, successful_request, response_msg -): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, name=name, model=stub, batch_size=input_batch_size - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - system_logging=ServerSystemLoggingConfig( - request_details=SystemLoggingGroup(enable=True) - ), - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - client.post("/predict", json=json_payload) - - calls = server_logger.logger.loggers[0].logger.loggers[0].calls - - _test_successful_requests(calls, successful_request) - _test_response_msg(calls, response_msg) - - -def _test_cpu_utilization(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith("identifier:resource_utilization/cpu_utilization_percent") - ] - assert len(relevant_calls) == num_iterations - - -def 
_test_memory_utilization(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith("identifier:resource_utilization/memory_utilization_percent") - ] - values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] - assert len(relevant_calls) == num_iterations - # memory utilization is a percentage, so it should be between 0 and 100 - assert all(0.0 < value < 100.0 for value in values) - - -def _test_total_memory_available(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith( - "identifier:resource_utilization/total_memory_available_bytes" - ) - ] - values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] - assert len(relevant_calls) == num_iterations - # assert all values are the same (total memory available is constant) - assert all(value == values[0] for value in values) - - -def _test_additional_items_to_log(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith("identifier:resource_utilization/test") - ] - values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] - assert len(relevant_calls) == num_iterations - # assert all values are the same ({"test" : 1} is constant) - assert all(value == 1 for value in values) - - -@pytest.mark.parametrize( - "num_iterations, additional_items_to_log", - [ - (5, {}), - (3, {"test": 1}), - ], -) -def test_log_resource_utilization(num_iterations, additional_items_to_log): - server_logger = ListLogger() - - for iter in range(num_iterations): - log_resource_utilization( - server_logger, prefix="resource_utilization", **additional_items_to_log - ) - - calls = server_logger.calls - - _test_cpu_utilization(calls, num_iterations) - _test_memory_utilization(calls, num_iterations) - _test_total_memory_available(calls, num_iterations) +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
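# --- illustrative sketch (not part of the patch) ----------------------------
# Condensed form of what `test_log_resource_utilization` above exercises: each
# call to `log_resource_utilization` is expected to emit one entry per resource
# metric (plus any extra keyword items) into the leaf logger. Imports mirror
# the ones used in this test module; the call-string format in the comments is
# inferred from the assertions above, not guaranteed.
from deepsparse.server.system_logging import log_resource_utilization
from tests.deepsparse.loggers.helpers import ListLogger

logger = ListLogger()
log_resource_utilization(logger, prefix="resource_utilization", test=1)

# expected identifiers, one logged call each:
#   identifier:resource_utilization/cpu_utilization_percent
#   identifier:resource_utilization/memory_utilization_percent
#   identifier:resource_utilization/total_memory_available_bytes
#   identifier:resource_utilization/test   (value: 1)
print(logger.calls)
# -----------------------------------------------------------------------------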
+# +# from unittest import mock +# +# import pytest +# from deepsparse.loggers.config import SystemLoggingGroup +# from deepsparse.server.config import ( +# EndpointConfig, +# ServerConfig, +# ServerSystemLoggingConfig, +# ) +# from deepsparse.server.helpers import server_logger_from_config +# from deepsparse.server.server import _build_app +# from deepsparse.server.system_logging import log_resource_utilization +# from fastapi.testclient import TestClient +# from tests.deepsparse.loggers.helpers import ListLogger +# from tests.utils import mock_engine +# +# +# logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" +# stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 +# task = "text-classification" +# name = "endpoint_name" +# +# +# def _test_successful_requests(calls, successful_request): +# relevant_call = [ +# call +# for call in calls +# if call.startswith("identifier:request_details/successful_request_count") +# ] +# assert len(relevant_call) == 1 +# relevant_call = relevant_call[0] +# value = bool(int(relevant_call.split("value:")[1].split(",")[0])) +# assert value == successful_request +# +# +# def _test_response_msg(calls, response_msg): +# relevant_call = [ +# call +# for call in calls +# if call.startswith("identifier:request_details/response_message") +# ] +# assert len(relevant_call) == 1 +# relevant_call = relevant_call[0] +# value = relevant_call.split("value:")[1].split(",")[0] +# assert value == response_msg +# +# +# @pytest.mark.parametrize( +# "json_payload, input_batch_size, successful_request, response_msg", +# [ +# ({"sequences": "today is great"}, 1, True, "Response status code: 200"), +# ( +# {"sequences": ["today is great", "today is great"]}, +# 2, +# True, +# "Response status code: 200", +# ), +# ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), +# ], +# ) +# def test_log_request_details( +# json_payload, input_batch_size, successful_request, response_msg +# ): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, name=name, model=stub, batch_size=input_batch_size +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# system_logging=ServerSystemLoggingConfig( +# request_details=SystemLoggingGroup(enable=True) +# ), +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# client.post("/predict", json=json_payload) +# +# calls = server_logger.logger.loggers[0].logger.loggers[0].calls +# +# _test_successful_requests(calls, successful_request) +# _test_response_msg(calls, response_msg) +# +# +# def _test_cpu_utilization(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith("identifier:resource_utilization/cpu_utilization_percent") +# ] +# assert len(relevant_calls) == num_iterations +# +# +# def _test_memory_utilization(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith("identifier:resource_utilization/memory_utilization_percent") +# ] +# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] +# assert len(relevant_calls) == num_iterations +# # memory utilization is a percentage, so it should be between 0 and 100 +# assert all(0.0 < value < 100.0 for value in values) +# +# +# def 
_test_total_memory_available(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith( +# "identifier:resource_utilization/total_memory_available_bytes" +# ) +# ] +# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] +# assert len(relevant_calls) == num_iterations +# # assert all values are the same (total memory available is constant) +# assert all(value == values[0] for value in values) +# +# +# def _test_additional_items_to_log(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith("identifier:resource_utilization/test") +# ] +# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] +# assert len(relevant_calls) == num_iterations +# # assert all values are the same ({"test" : 1} is constant) +# assert all(value == 1 for value in values) +# +# +# @pytest.mark.parametrize( +# "num_iterations, additional_items_to_log", +# [ +# (5, {}), +# (3, {"test": 1}), +# ], +# ) +# def test_log_resource_utilization(num_iterations, additional_items_to_log): +# server_logger = ListLogger() +# +# for iter in range(num_iterations): +# log_resource_utilization( +# server_logger, prefix="resource_utilization", **additional_items_to_log +# ) +# +# calls = server_logger.calls +# +# _test_cpu_utilization(calls, num_iterations) +# _test_memory_utilization(calls, num_iterations) +# _test_total_memory_available(calls, num_iterations) From 7f9935b8ea4456bf691a54c6585109576da39656 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 7 Jun 2023 16:14:46 +0000 Subject: [PATCH 04/68] ready to review --- src/deepsparse/transformers/helpers.py | 44 +- src/deepsparse/utils/onnx.py | 30 +- src/deepsparse/yolo/utils/utils.py | 7 + tests/server/test_app.py | 332 +++++++-------- tests/server/test_config.py | 444 ++++++++++---------- tests/server/test_endpoints.py | 536 ++++++++++++------------- tests/server/test_loggers.py | 486 +++++++++++----------- tests/server/test_system_logging.py | 338 ++++++++-------- 8 files changed, 1110 insertions(+), 1107 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 847a7a9924..83b519baa5 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -135,7 +135,6 @@ def overwrite_transformer_onnx_model_inputs( path: str, batch_size: int = 1, max_length: int = 128, - output_path: Optional[str] = None, inplace: bool = True, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ @@ -146,25 +145,16 @@ def overwrite_transformer_onnx_model_inputs( :param path: path to the ONNX model to override :param batch_size: batch size to set :param max_length: max sequence length to set - :param output_path: if provided, the model will be saved to the given path, - otherwise, the model will be saved to a named temporary file that will - be deleted after the program exits - :param inplace: if True, the model will be modified in place, otherwise - a copy of the model will be saved to a temporary file - :return: if no output path, a tuple of the saved path to the model, list of - model input names, and reference to the tempfile object will be returned - otherwise, only the model input names will be returned + :param inplace: if True, the model will be modified in place (its inputs will + be overwritten). 
Else, a copy of that model, with overwritten inputs, + will be saved to a temporary file + :return: tuple of (path to the overwritten model, list of input names that were + overwritten, and a temporary file containing the overwritten model if + `inplace=False`, else None) """ - - if inplace and output_path is not None: - raise ValueError( - "Cannot specify both inplace=True and output_path. If inplace=True, " - "the model will be modified in place (the returned path will be identical" - "to the input path specified in argument `path`)" - ) - if inplace: - output_path = path # overwrite input shapes + # if > 2Gb model is to be modified in-place, operate + # exclusively on the model graph model = onnx.load(path, load_external_data=not inplace) initializer_input_names = set([node.name for node in model.graph.initializer]) external_inputs = [ @@ -177,14 +167,20 @@ def overwrite_transformer_onnx_model_inputs( input_names.append(external_input.name) # Save modified model - if not inplace: - tmp_file = NamedTemporaryFile() # file will be deleted after program exit + if inplace: + _LOGGER.info( + f"Overwriting in-place the input shapes of the transformer model at {path}" + ) + save_onnx(model, path) + return path, input_names, None + else: + tmp_file = NamedTemporaryFile() + _LOGGER.info( + f"Saving a copy of the transformer model: {path} " + f"with overwritten input shapes to {tmp_file.name}" + ) save_onnx(model, tmp_file.name) return tmp_file.name, input_names, tmp_file - else: - save_onnx(model, output_path) - - return output_path, input_names, None def _get_file_parent(file_path: str) -> str: diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 00f5f24233..eb31179bc9 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -24,7 +24,7 @@ from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE from deepsparse.utils.extractor import Extractor -from sparsezoo.utils import onnx_includes_external_data, save_onnx, validate_onnx +from sparsezoo.utils import save_onnx, validate_onnx try: @@ -60,21 +60,15 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ - - if not onnx_includes_external_data(model) and with_external_data: - raise ValueError( - "Model does not include external data, it only includes the model graph." - "Cannot save its external data to separate a file." - "Set argument `with_external_data`=False" - ) shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) - _LOGGER.warning(f"Saving model to temporary directory: {tempfile.tempdir}") + _LOGGER.info(f"Saving model to temporary directory: {tempfile.tempdir}") if with_external_data: external_data = os.path.join( tempfile.tempdir, next(tempfile._get_candidate_names()) ) has_external_data = save_onnx(model, shaped_model.name, external_data) + _LOGGER.info(f"Saving external data to temporary directory: {external_data}") else: has_external_data = save_onnx(model, shaped_model.name) try: @@ -218,7 +212,7 @@ def override_onnx_batch_size( external data are saved along the model graph. :param batch_size: Override for the batch size dimension :param inplace: If True, overwrite the original model file. - Else save the modified model to a temporary file. + Else, save the modified model to a temporary file. :return: File path to modified ONNX model. 
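# --- illustrative sketch (not part of the patch) ----------------------------
# Minimal usage sketch for the reworked `overwrite_transformer_onnx_model_inputs`
# helper above, assuming it is importable from `deepsparse.transformers.helpers`
# and keeps the signature shown in this hunk (path, batch_size, max_length,
# inplace). The model path below is hypothetical.
from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs

model_path = "/path/to/model.onnx"  # hypothetical path to a transformer ONNX file

# inplace=True: the file at `model_path` is overwritten; no tempfile handle returned
path, input_names, tmp = overwrite_transformer_onnx_model_inputs(
    model_path, batch_size=1, max_length=128, inplace=True
)
assert path == model_path and tmp is None

# inplace=False: the original file is untouched; the modified copy lives in a
# NamedTemporaryFile that must stay referenced for as long as the copy is used
path, input_names, tmp = overwrite_transformer_onnx_model_inputs(
    model_path, batch_size=1, max_length=128, inplace=False
)
assert path == tmp.name
# -----------------------------------------------------------------------------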
If inplace is True, the modified model will be saved to the same path as the original @@ -234,12 +228,13 @@ def override_onnx_batch_size( for external_input in external_inputs: external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # Save modified model, this will be cleaned up when context is exited if inplace: + _LOGGER.info( + f"Overwriting in-place the batch size of the model at {onnx_filepath}" + ) save_onnx(model, onnx_filepath) return onnx_filepath else: - # Save modified model, this will be cleaned up when context is exited return save_onnx_to_temp_files(model, with_external_data=not inplace) @@ -302,12 +297,17 @@ def override_onnx_input_shapes( for dim_idx, dim in enumerate(external_input.type.tensor_type.shape.dim): dim.dim_value = input_shapes[input_idx][dim_idx] - # Save modified model, this will be cleaned up when context is exited if inplace: + _LOGGER.info( + "Overwriting in-place the input shapes of the model " f"at {onnx_filepath}" + ) onnx.save(model, onnx_filepath) return onnx_filepath else: - # Save modified model, this will be cleaned up when context is exited + _LOGGER.info( + f"Saving the input shapes of the model at {onnx_filepath} " + f"to a temporary file" + ) return save_onnx_to_temp_files(model, with_external_data=not inplace) @@ -387,7 +387,7 @@ def truncate_onnx_model( output.type.tensor_type.shape.Clear() # save and check model - _LOGGER.info("Saving truncated model to %s", output_filepath) + _LOGGER.debug(f"Saving truncated model to {output_filepath}") save_onnx(extracted_model, output_filepath, "external_data") validate_onnx(output_filepath) diff --git a/src/deepsparse/yolo/utils/utils.py b/src/deepsparse/yolo/utils/utils.py index 3a0f596fe1..e778fabe17 100644 --- a/src/deepsparse/yolo/utils/utils.py +++ b/src/deepsparse/yolo/utils/utils.py @@ -405,9 +405,16 @@ def modify_yolo_onnx_input_shape( set_tensor_dim_shape(model.graph.output[0], 1, num_predictions) if inplace: + _LOGGER.info( + "Overwriting in-place the ONNX model " + f"at {model_path} with the new input shape" + ) save_onnx(model, model_path) return model_path else: + _LOGGER.info( + "Saving the ONNX model with the " "new input shape to a temporary file" + ) return save_onnx_to_temp_files(model, with_external_data=not inplace) diff --git a/tests/server/test_app.py b/tests/server/test_app.py index 678152adc9..9bc71e1a36 100644 --- a/tests/server/test_app.py +++ b/tests/server/test_app.py @@ -1,166 +1,166 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# -# import os -# from copy import deepcopy -# from re import escape -# from unittest.mock import patch -# -# import pytest -# from deepsparse.server.config import EndpointConfig, ServerConfig -# from deepsparse.server.server import _build_app -# -# -# def test_add_multiple_endpoints_with_no_route(): -# with pytest.raises( -# ValueError, -# match=( -# "must specify `route` for all endpoints if multiple endpoints are used." 
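# --- illustrative sketch (not part of the patch) ----------------------------
# Sketch of the two modes of `override_onnx_batch_size` described above,
# assuming it is importable from `deepsparse.utils.onnx`; the model path is
# hypothetical and the behaviour of the non-inplace branch is taken from the
# docstring in this hunk rather than verified against the implementation.
from deepsparse.utils.onnx import override_onnx_batch_size

onnx_filepath = "/path/to/model.onnx"  # hypothetical path

# inplace=True: the original file is rewritten in place and its own path comes back
same_path = override_onnx_batch_size(onnx_filepath, batch_size=4, inplace=True)

# inplace=False: per the docstring, the original file is left untouched and the
# modified model is written out to a temporary file instead
tmp_result = override_onnx_batch_size(onnx_filepath, batch_size=4, inplace=False)
# -----------------------------------------------------------------------------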
-# ), -# ): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# endpoints=[ -# EndpointConfig(task="", model="", route=None), -# EndpointConfig(task="", model="", route=None), -# ], -# loggers={}, -# ) -# ) -# -# -# def test_add_multiple_endpoints_with_same_route(): -# with pytest.raises(ValueError, match="asdf specified 2 times"): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# endpoints=[ -# EndpointConfig(task="", model="", route="asdf"), -# EndpointConfig(task="", model="", route="asdf"), -# ], -# loggers={}, -# ) -# ) -# -# -# def test_invalid_integration(): -# with pytest.raises( -# ValueError, -# match=escape( -# "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" -# ), -# ): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# integration="asdf", -# endpoints=[], -# loggers={}, -# ) -# ) -# -# -# def test_pytorch_num_threads(): -# torch = pytest.importorskip("torch") -# -# orig_num_threads = torch.get_num_threads() -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# pytorch_num_threads=None, -# endpoints=[], -# loggers={}, -# ) -# ) -# assert torch.get_num_threads() == orig_num_threads -# -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# pytorch_num_threads=1, -# endpoints=[], -# loggers={}, -# ) -# ) -# assert torch.get_num_threads() == 1 -# -# -# @patch.dict(os.environ, deepcopy(os.environ)) -# def test_thread_pinning_none(): -# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) -# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="none", -# endpoints=[], -# loggers={}, -# ) -# ) -# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" -# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" -# -# -# @patch.dict(os.environ, deepcopy(os.environ)) -# def test_thread_pinning_numa(): -# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) -# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="numa", -# endpoints=[], -# loggers={}, -# ) -# ) -# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" -# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" -# -# -# @patch.dict(os.environ, deepcopy(os.environ)) -# def test_thread_pinning_cores(): -# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) -# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="core", -# endpoints=[], -# loggers={}, -# ) -# ) -# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" -# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" -# -# -# def test_invalid_thread_pinning(): -# with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="asdf", -# endpoints=[], -# loggers={}, -# ) -# ) +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from copy import deepcopy +from re import escape +from unittest.mock import patch + +import pytest +from deepsparse.server.config import EndpointConfig, ServerConfig +from deepsparse.server.server import _build_app + + +def test_add_multiple_endpoints_with_no_route(): + with pytest.raises( + ValueError, + match=( + "must specify `route` for all endpoints if multiple endpoints are used." + ), + ): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + endpoints=[ + EndpointConfig(task="", model="", route=None), + EndpointConfig(task="", model="", route=None), + ], + loggers={}, + ) + ) + + +def test_add_multiple_endpoints_with_same_route(): + with pytest.raises(ValueError, match="asdf specified 2 times"): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + endpoints=[ + EndpointConfig(task="", model="", route="asdf"), + EndpointConfig(task="", model="", route="asdf"), + ], + loggers={}, + ) + ) + + +def test_invalid_integration(): + with pytest.raises( + ValueError, + match=escape( + "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" + ), + ): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + integration="asdf", + endpoints=[], + loggers={}, + ) + ) + + +def test_pytorch_num_threads(): + torch = pytest.importorskip("torch") + + orig_num_threads = torch.get_num_threads() + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + pytorch_num_threads=None, + endpoints=[], + loggers={}, + ) + ) + assert torch.get_num_threads() == orig_num_threads + + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + pytorch_num_threads=1, + endpoints=[], + loggers={}, + ) + ) + assert torch.get_num_threads() == 1 + + +@patch.dict(os.environ, deepcopy(os.environ)) +def test_thread_pinning_none(): + os.environ.pop("NM_BIND_THREADS_TO_CORES", None) + os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="none", + endpoints=[], + loggers={}, + ) + ) + assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" + assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" + + +@patch.dict(os.environ, deepcopy(os.environ)) +def test_thread_pinning_numa(): + os.environ.pop("NM_BIND_THREADS_TO_CORES", None) + os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="numa", + endpoints=[], + loggers={}, + ) + ) + assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" + assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" + + +@patch.dict(os.environ, deepcopy(os.environ)) +def test_thread_pinning_cores(): + os.environ.pop("NM_BIND_THREADS_TO_CORES", None) + os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="core", + endpoints=[], + loggers={}, + ) + ) + assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" + assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" + + +def test_invalid_thread_pinning(): + with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="asdf", + endpoints=[], + loggers={}, + ) + ) diff --git a/tests/server/test_config.py b/tests/server/test_config.py index f2f9b0e6fe..b1c1c75a84 100644 --- a/tests/server/test_config.py +++ b/tests/server/test_config.py @@ -1,222 
+1,222 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# -# import yaml -# -# import pytest -# from deepsparse.server.config import ( -# EndpointConfig, -# ImageSizesConfig, -# MetricFunctionConfig, -# SequenceLengthsConfig, -# ServerConfig, -# ) -# -# -# def test_no_bucketing_config(): -# cfg = EndpointConfig(task="", model="").to_pipeline_config() -# assert cfg.input_shapes is None -# assert cfg.kwargs == {} -# -# -# @pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) -# def test_bucketing_sequence_length_for_cv(task): -# with pytest.raises(ValueError, match=f"for non-nlp task {task}"): -# EndpointConfig( -# task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) -# ).to_pipeline_config() -# -# -# @pytest.mark.parametrize( -# "task", ["question_answering", "text_classification", "token_classification"] -# ) -# def test_bucketing_image_size_for_nlp(task): -# with pytest.raises(ValueError, match=f"for non computer vision task {task}"): -# EndpointConfig( -# task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) -# ).to_pipeline_config() -# -# -# def test_bucketing_zero_sequence_length(): -# with pytest.raises(ValueError, match="at least one sequence length"): -# EndpointConfig( -# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) -# ).to_pipeline_config() -# -# -# def test_bucketing_zero_image_size(): -# with pytest.raises(ValueError, match="at least one image size"): -# EndpointConfig( -# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) -# ).to_pipeline_config() -# -# -# def test_bucketing_one_sequence_length(): -# cfg = EndpointConfig( -# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) -# ).to_pipeline_config() -# assert cfg.input_shapes is None -# assert cfg.kwargs == {"sequence_length": 32} -# -# -# def test_bucketing_multi_sequence_length(): -# cfg = EndpointConfig( -# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) -# ).to_pipeline_config() -# assert cfg.input_shapes is None -# assert cfg.kwargs == {"sequence_length": [32, 64]} -# -# -# def test_bucketing_one_image_size(): -# cfg = EndpointConfig( -# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) -# ).to_pipeline_config() -# assert cfg.input_shapes == [[256, 256]] -# assert cfg.kwargs == {} -# -# -# def test_endpoint_config_to_pipeline_copy_fields(): -# cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() -# assert cfg.task == "qa" -# assert cfg.model_path == "zxcv" -# -# cfg = EndpointConfig(task="", model="").to_pipeline_config() -# assert cfg.batch_size == 1 -# -# cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() -# assert cfg.batch_size == 64 -# -# -# def test_yaml_load_config(tmp_path): -# server_config = ServerConfig( -# num_cores=1, -# num_workers=2, -# integration="sagemaker", -# endpoints=[ -# EndpointConfig( 
-# name="asdf", -# route="qwer", -# task="uiop", -# model="hjkl", -# batch_size=1, -# bucketing=None, -# ), -# EndpointConfig( -# name="asdfd", -# route="qwer", -# task="uiop", -# model="hjkl", -# batch_size=2, -# bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), -# ), -# EndpointConfig( -# name="asdfde", -# route="qwer", -# task="uiop", -# model="hjkl", -# batch_size=3, -# bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), -# ), -# ], -# loggers={}, -# ) -# -# path = tmp_path / "config.yaml" -# with open(path, "w") as fp: -# yaml.dump(server_config.dict(), fp) -# -# with open(path) as fp: -# obj = yaml.load(fp, Loader=yaml.Loader) -# server_config2 = ServerConfig(**obj) -# assert server_config == server_config2 -# -# -# metric_function_config_yaml_1 = """ -# func: identity -# frequency: 5 -# loggers: -# - python""" -# -# metric_function_config_yaml_2 = """ -# func: numpy.max""" -# -# metric_function_config_yaml_3 = """ -# func: numpy.max -# frequency: 0""" -# -# -# @pytest.mark.parametrize( -# "config_yaml, should_fail, instance_type", -# [ -# (metric_function_config_yaml_1, False, MetricFunctionConfig), -# (metric_function_config_yaml_2, False, MetricFunctionConfig), -# ( -# metric_function_config_yaml_3, -# True, -# MetricFunctionConfig, -# ), # frequency cannot be zero -# ], -# ) -# def test_function_logging_config(config_yaml, should_fail, instance_type): -# obj = yaml.safe_load(config_yaml) -# if should_fail: -# with pytest.raises(Exception): -# MetricFunctionConfig(**obj) -# else: -# assert MetricFunctionConfig(**obj) -# -# -# def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): -# return ServerConfig( -# endpoints=[ -# EndpointConfig( -# name=endpoint_1_name, -# task=task_name, -# model="hjkl", -# ), -# EndpointConfig( -# name=endpoint_2_name, -# task=task_name, -# model="hjkl", -# ), -# ] -# ) -# -# -# @pytest.mark.parametrize( -# "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 -# [ -# ("some_task", None, None, False, "some_task-0", "some_task-1"), -# ("some_task", "name_1", None, False, "name_1", "some_task-0"), -# ("some_task", "name_1", "name_2", False, "name_1", "name_2"), -# ("some_task", "name_1", "name_1", True, None, None), -# ], -# ) -# def test_unique_endpoint_names( -# task_name, -# endpoint_1_name, -# endpoint_2_name, -# raise_error, -# expected_endpoint_1_name, -# expected_endpoint_2_name, -# ): -# if raise_error: -# with pytest.raises(ValueError): -# _create_server_config(task_name, endpoint_1_name, endpoint_2_name) -# return -# return -# -# server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) -# assert server_config.endpoints[0].name == expected_endpoint_1_name -# assert server_config.endpoints[1].name == expected_endpoint_2_name +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import yaml + +import pytest +from deepsparse.server.config import ( + EndpointConfig, + ImageSizesConfig, + MetricFunctionConfig, + SequenceLengthsConfig, + ServerConfig, +) + + +def test_no_bucketing_config(): + cfg = EndpointConfig(task="", model="").to_pipeline_config() + assert cfg.input_shapes is None + assert cfg.kwargs == {} + + +@pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) +def test_bucketing_sequence_length_for_cv(task): + with pytest.raises(ValueError, match=f"for non-nlp task {task}"): + EndpointConfig( + task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) + ).to_pipeline_config() + + +@pytest.mark.parametrize( + "task", ["question_answering", "text_classification", "token_classification"] +) +def test_bucketing_image_size_for_nlp(task): + with pytest.raises(ValueError, match=f"for non computer vision task {task}"): + EndpointConfig( + task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) + ).to_pipeline_config() + + +def test_bucketing_zero_sequence_length(): + with pytest.raises(ValueError, match="at least one sequence length"): + EndpointConfig( + task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) + ).to_pipeline_config() + + +def test_bucketing_zero_image_size(): + with pytest.raises(ValueError, match="at least one image size"): + EndpointConfig( + task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) + ).to_pipeline_config() + + +def test_bucketing_one_sequence_length(): + cfg = EndpointConfig( + task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) + ).to_pipeline_config() + assert cfg.input_shapes is None + assert cfg.kwargs == {"sequence_length": 32} + + +def test_bucketing_multi_sequence_length(): + cfg = EndpointConfig( + task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) + ).to_pipeline_config() + assert cfg.input_shapes is None + assert cfg.kwargs == {"sequence_length": [32, 64]} + + +def test_bucketing_one_image_size(): + cfg = EndpointConfig( + task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) + ).to_pipeline_config() + assert cfg.input_shapes == [[256, 256]] + assert cfg.kwargs == {} + + +def test_endpoint_config_to_pipeline_copy_fields(): + cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() + assert cfg.task == "qa" + assert cfg.model_path == "zxcv" + + cfg = EndpointConfig(task="", model="").to_pipeline_config() + assert cfg.batch_size == 1 + + cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() + assert cfg.batch_size == 64 + + +def test_yaml_load_config(tmp_path): + server_config = ServerConfig( + num_cores=1, + num_workers=2, + integration="sagemaker", + endpoints=[ + EndpointConfig( + name="asdf", + route="qwer", + task="uiop", + model="hjkl", + batch_size=1, + bucketing=None, + ), + EndpointConfig( + name="asdfd", + route="qwer", + task="uiop", + model="hjkl", + batch_size=2, + bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), + ), + EndpointConfig( + name="asdfde", + route="qwer", + task="uiop", + model="hjkl", + batch_size=3, + bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), + ), + ], + loggers={}, + ) + + path = tmp_path / "config.yaml" + with open(path, "w") as fp: + yaml.dump(server_config.dict(), fp) + + with open(path) as fp: + obj = yaml.load(fp, Loader=yaml.Loader) + server_config2 = ServerConfig(**obj) + assert server_config == server_config2 + + +metric_function_config_yaml_1 = """ + func: identity + 
frequency: 5 + loggers: + - python""" + +metric_function_config_yaml_2 = """ + func: numpy.max""" + +metric_function_config_yaml_3 = """ + func: numpy.max + frequency: 0""" + + +@pytest.mark.parametrize( + "config_yaml, should_fail, instance_type", + [ + (metric_function_config_yaml_1, False, MetricFunctionConfig), + (metric_function_config_yaml_2, False, MetricFunctionConfig), + ( + metric_function_config_yaml_3, + True, + MetricFunctionConfig, + ), # frequency cannot be zero + ], +) +def test_function_logging_config(config_yaml, should_fail, instance_type): + obj = yaml.safe_load(config_yaml) + if should_fail: + with pytest.raises(Exception): + MetricFunctionConfig(**obj) + else: + assert MetricFunctionConfig(**obj) + + +def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): + return ServerConfig( + endpoints=[ + EndpointConfig( + name=endpoint_1_name, + task=task_name, + model="hjkl", + ), + EndpointConfig( + name=endpoint_2_name, + task=task_name, + model="hjkl", + ), + ] + ) + + +@pytest.mark.parametrize( + "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 + [ + ("some_task", None, None, False, "some_task-0", "some_task-1"), + ("some_task", "name_1", None, False, "name_1", "some_task-0"), + ("some_task", "name_1", "name_2", False, "name_1", "name_2"), + ("some_task", "name_1", "name_1", True, None, None), + ], +) +def test_unique_endpoint_names( + task_name, + endpoint_1_name, + endpoint_2_name, + raise_error, + expected_endpoint_1_name, + expected_endpoint_2_name, +): + if raise_error: + with pytest.raises(ValueError): + _create_server_config(task_name, endpoint_1_name, endpoint_2_name) + return + return + + server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) + assert server_config.endpoints[0].name == expected_endpoint_1_name + assert server_config.endpoints[1].name == expected_endpoint_2_name diff --git a/tests/server/test_endpoints.py b/tests/server/test_endpoints.py index 411fb46446..f028b37e75 100644 --- a/tests/server/test_endpoints.py +++ b/tests/server/test_endpoints.py @@ -1,268 +1,268 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# -# from typing import List -# from unittest.mock import Mock -# -# from pydantic import BaseModel -# -# import pytest -# from deepsparse.loggers import MultiLogger -# from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig -# from deepsparse.server.server import _add_pipeline_endpoint, _build_app -# from fastapi import FastAPI, UploadFile -# from fastapi.testclient import TestClient -# from tests.utils import mock_engine -# -# -# class FromFilesSchema(BaseModel): -# def from_files(self, f): -# # do nothing - this method exists just to test files endpoint logic -# ... 
-# -# -# class StrSchema(BaseModel): -# value: str -# -# -# def parse(v: StrSchema) -> int: -# return int(v.value) -# -# -# class TestStatusEndpoints: -# @pytest.fixture(scope="class") -# def server_config(self): -# server_config = ServerConfig( -# num_cores=1, num_workers=1, endpoints=[], loggers={} -# ) -# yield server_config -# -# @pytest.fixture(scope="class") -# def client(self, server_config): -# yield TestClient(_build_app(server_config)) -# -# def test_config(self, server_config, client): -# response = client.get("/config") -# loaded = ServerConfig(**response.json()) -# assert loaded == server_config -# -# @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) -# def test_pings_exist(self, client, route): -# response = client.get(route) -# assert response.status_code == 200 -# assert response.json() is True -# -# def test_docs_exist(self, client): -# assert client.get("/docs").status_code == 200 -# -# def test_home_redirects_to_docs(self, client): -# response = client.get("/") -# assert response.status_code == 200 -# assert response.request.path_url == "/docs" -# assert len(response.history) > 0 -# assert response.history[-1].is_redirect -# -# -# class TestMockEndpoints: -# @pytest.fixture(scope="class") -# def server_config(self): -# server_config = ServerConfig( -# num_cores=1, num_workers=1, endpoints=[], loggers={} -# ) -# yield server_config -# -# @pytest.fixture(scope="class") -# def app(self, server_config): -# yield _build_app(server_config) -# -# @pytest.fixture(scope="class") -# def client(self, app): -# yield TestClient(app) -# -# def test_add_model_endpoint(self, app: FastAPI, client: TestClient): -# mock_pipeline = Mock( -# side_effect=parse, -# input_schema=StrSchema, -# output_schema=int, -# logger=MultiLogger([]), -# ) -# _add_pipeline_endpoint( -# app, -# system_logging_config=SystemLoggingConfig(), -# endpoint_config=Mock(route="/predict/parse_int"), -# pipeline=mock_pipeline, -# ) -# assert app.routes[-1].path == "/predict/parse_int" -# assert app.routes[-1].response_model is int -# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} -# assert app.routes[-1].methods == {"POST"} -# -# for v in ["1234", "5678"]: -# response = client.post("/predict/parse_int", json=dict(value=v)) -# assert response.status_code == 200 -# assert response.json() == int(v) -# -# def test_add_model_endpoint_with_from_files(self, app): -# _add_pipeline_endpoint( -# app, -# system_logging_config=Mock(), -# endpoint_config=Mock(route="/predict/parse_int"), -# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), -# ) -# assert app.routes[-2].path == "/predict/parse_int" -# assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} -# assert app.routes[-1].path == "/predict/parse_int/from_files" -# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} -# assert app.routes[-1].response_model is int -# assert app.routes[-1].methods == {"POST"} -# -# def test_sagemaker_only_adds_one_endpoint(self, app): -# num_routes = len(app.routes) -# _add_pipeline_endpoint( -# app, -# endpoint_config=Mock(route="/predict/parse_int"), -# system_logging_config=Mock(), -# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), -# integration="sagemaker", -# ) -# assert len(app.routes) == num_routes + 1 -# assert app.routes[-1].path == "/invocations" -# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} -# -# num_routes = len(app.routes) -# _add_pipeline_endpoint( -# app, 
-# endpoint_config=Mock(route="/predict/parse_int"), -# system_logging_config=Mock(), -# pipeline=Mock(input_schema=StrSchema, output_schema=int), -# integration="sagemaker", -# ) -# assert len(app.routes) == num_routes + 1 -# assert app.routes[-1].path == "/invocations" -# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} -# -# def test_add_endpoint_with_no_route_specified(self, app): -# _add_pipeline_endpoint( -# app, -# endpoint_config=Mock(route=None), -# system_logging_config=Mock(), -# pipeline=Mock(input_schema=StrSchema, output_schema=int), -# ) -# assert app.routes[-1].path == "/predict" -# -# -# class TestActualModelEndpoints: -# @pytest.fixture(scope="class") -# def client(self): -# stub = ( -# "zoo:nlp/text_classification/distilbert-none/" -# "pytorch/huggingface/qqp/pruned80_quant-none-vnni" -# ) -# server_config = ServerConfig( -# num_cores=1, -# num_workers=1, -# endpoints=[ -# EndpointConfig( -# route="/predict/dynamic-batch", -# task="text-classification", -# model=stub, -# batch_size=1, -# ), -# EndpointConfig( -# route="/predict/static-batch", -# task="text-classification", -# model=stub, -# batch_size=2, -# ), -# ], -# loggers={}, # do not instantiate any loggers -# ) -# with mock_engine(rng_seed=0): -# app = _build_app(server_config) -# yield TestClient(app) -# -# def test_static_batch_errors_on_wrong_batch_size(self, client): -# with pytest.raises( -# RuntimeError, -# match=( -# "batch size of 1 passed into pipeline is " -# "not divisible by model batch size of 2" -# ), -# ): -# client.post("/predict/static-batch", json={"sequences": "today is great"}) -# -# def test_static_batch_good_request(self, client): -# response = client.post( -# "/predict/static-batch", -# json={"sequences": ["today is great", "today is terrible"]}, -# ) -# assert response.status_code == 200 -# output = response.json() -# assert len(output["labels"]) == 2 -# assert len(output["scores"]) == 2 -# -# @pytest.mark.parametrize( -# "seqs", -# [ -# ["today is great"], -# ["today is great", "today is terrible"], -# ["the first sentence", "the second sentence", "the third sentence"], -# ], -# ) -# def test_dynamic_batch_any(self, client, seqs): -# response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) -# assert response.status_code == 200 -# output = response.json() -# assert len(output["labels"]) == len(seqs) -# assert len(output["scores"]) == len(seqs) -# -# -# class TestDynamicEndpoints: -# @pytest.fixture(scope="class") -# def client(self): -# server_config = ServerConfig( -# num_cores=1, num_workers=1, endpoints=[], loggers=None -# ) -# with mock_engine(rng_seed=0): -# app = _build_app(server_config) -# yield TestClient(app) -# -# -# @mock_engine(rng_seed=0) -# def test_dynamic_add_and_remove_endpoint(engine_mock): -# server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) -# app = _build_app(server_config) -# client = TestClient(app) -# -# # assert /predict doesn't exist -# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code -# -# # add /predict -# response = client.post( -# "/endpoints", -# json=EndpointConfig(task="text-classification", model="default").dict(), -# ) -# assert response.status_code == 200 -# response = client.post("/predict", json=dict(sequences="asdf")) -# assert response.status_code == 200 -# -# # remove /predict -# response = client.delete( -# "/endpoints", -# json=EndpointConfig( -# route="/predict", task="text-classification", model="default" -# ).dict(), -# ) -# assert 
response.status_code == 200 -# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from unittest.mock import Mock + +from pydantic import BaseModel + +import pytest +from deepsparse.loggers import MultiLogger +from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig +from deepsparse.server.server import _add_pipeline_endpoint, _build_app +from fastapi import FastAPI, UploadFile +from fastapi.testclient import TestClient +from tests.utils import mock_engine + + +class FromFilesSchema(BaseModel): + def from_files(self, f): + # do nothing - this method exists just to test files endpoint logic + ... + + +class StrSchema(BaseModel): + value: str + + +def parse(v: StrSchema) -> int: + return int(v.value) + + +class TestStatusEndpoints: + @pytest.fixture(scope="class") + def server_config(self): + server_config = ServerConfig( + num_cores=1, num_workers=1, endpoints=[], loggers={} + ) + yield server_config + + @pytest.fixture(scope="class") + def client(self, server_config): + yield TestClient(_build_app(server_config)) + + def test_config(self, server_config, client): + response = client.get("/config") + loaded = ServerConfig(**response.json()) + assert loaded == server_config + + @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) + def test_pings_exist(self, client, route): + response = client.get(route) + assert response.status_code == 200 + assert response.json() is True + + def test_docs_exist(self, client): + assert client.get("/docs").status_code == 200 + + def test_home_redirects_to_docs(self, client): + response = client.get("/") + assert response.status_code == 200 + assert response.request.path_url == "/docs" + assert len(response.history) > 0 + assert response.history[-1].is_redirect + + +class TestMockEndpoints: + @pytest.fixture(scope="class") + def server_config(self): + server_config = ServerConfig( + num_cores=1, num_workers=1, endpoints=[], loggers={} + ) + yield server_config + + @pytest.fixture(scope="class") + def app(self, server_config): + yield _build_app(server_config) + + @pytest.fixture(scope="class") + def client(self, app): + yield TestClient(app) + + def test_add_model_endpoint(self, app: FastAPI, client: TestClient): + mock_pipeline = Mock( + side_effect=parse, + input_schema=StrSchema, + output_schema=int, + logger=MultiLogger([]), + ) + _add_pipeline_endpoint( + app, + system_logging_config=SystemLoggingConfig(), + endpoint_config=Mock(route="/predict/parse_int"), + pipeline=mock_pipeline, + ) + assert app.routes[-1].path == "/predict/parse_int" + assert app.routes[-1].response_model is int + assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} + assert app.routes[-1].methods == {"POST"} + + for v in ["1234", "5678"]: + response = client.post("/predict/parse_int", json=dict(value=v)) + assert response.status_code 
== 200 + assert response.json() == int(v) + + def test_add_model_endpoint_with_from_files(self, app): + _add_pipeline_endpoint( + app, + system_logging_config=Mock(), + endpoint_config=Mock(route="/predict/parse_int"), + pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), + ) + assert app.routes[-2].path == "/predict/parse_int" + assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} + assert app.routes[-1].path == "/predict/parse_int/from_files" + assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} + assert app.routes[-1].response_model is int + assert app.routes[-1].methods == {"POST"} + + def test_sagemaker_only_adds_one_endpoint(self, app): + num_routes = len(app.routes) + _add_pipeline_endpoint( + app, + endpoint_config=Mock(route="/predict/parse_int"), + system_logging_config=Mock(), + pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), + integration="sagemaker", + ) + assert len(app.routes) == num_routes + 1 + assert app.routes[-1].path == "/invocations" + assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} + + num_routes = len(app.routes) + _add_pipeline_endpoint( + app, + endpoint_config=Mock(route="/predict/parse_int"), + system_logging_config=Mock(), + pipeline=Mock(input_schema=StrSchema, output_schema=int), + integration="sagemaker", + ) + assert len(app.routes) == num_routes + 1 + assert app.routes[-1].path == "/invocations" + assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} + + def test_add_endpoint_with_no_route_specified(self, app): + _add_pipeline_endpoint( + app, + endpoint_config=Mock(route=None), + system_logging_config=Mock(), + pipeline=Mock(input_schema=StrSchema, output_schema=int), + ) + assert app.routes[-1].path == "/predict" + + +class TestActualModelEndpoints: + @pytest.fixture(scope="class") + def client(self): + stub = ( + "zoo:nlp/text_classification/distilbert-none/" + "pytorch/huggingface/qqp/pruned80_quant-none-vnni" + ) + server_config = ServerConfig( + num_cores=1, + num_workers=1, + endpoints=[ + EndpointConfig( + route="/predict/dynamic-batch", + task="text-classification", + model=stub, + batch_size=1, + ), + EndpointConfig( + route="/predict/static-batch", + task="text-classification", + model=stub, + batch_size=2, + ), + ], + loggers={}, # do not instantiate any loggers + ) + with mock_engine(rng_seed=0): + app = _build_app(server_config) + yield TestClient(app) + + def test_static_batch_errors_on_wrong_batch_size(self, client): + with pytest.raises( + RuntimeError, + match=( + "batch size of 1 passed into pipeline is " + "not divisible by model batch size of 2" + ), + ): + client.post("/predict/static-batch", json={"sequences": "today is great"}) + + def test_static_batch_good_request(self, client): + response = client.post( + "/predict/static-batch", + json={"sequences": ["today is great", "today is terrible"]}, + ) + assert response.status_code == 200 + output = response.json() + assert len(output["labels"]) == 2 + assert len(output["scores"]) == 2 + + @pytest.mark.parametrize( + "seqs", + [ + ["today is great"], + ["today is great", "today is terrible"], + ["the first sentence", "the second sentence", "the third sentence"], + ], + ) + def test_dynamic_batch_any(self, client, seqs): + response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) + assert response.status_code == 200 + output = response.json() + assert len(output["labels"]) == len(seqs) + assert len(output["scores"]) == len(seqs) + + +class 
TestDynamicEndpoints: + @pytest.fixture(scope="class") + def client(self): + server_config = ServerConfig( + num_cores=1, num_workers=1, endpoints=[], loggers=None + ) + with mock_engine(rng_seed=0): + app = _build_app(server_config) + yield TestClient(app) + + +@mock_engine(rng_seed=0) +def test_dynamic_add_and_remove_endpoint(engine_mock): + server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) + app = _build_app(server_config) + client = TestClient(app) + + # assert /predict doesn't exist + assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code + + # add /predict + response = client.post( + "/endpoints", + json=EndpointConfig(task="text-classification", model="default").dict(), + ) + assert response.status_code == 200 + response = client.post("/predict", json=dict(sequences="asdf")) + assert response.status_code == 200 + + # remove /predict + response = client.delete( + "/endpoints", + json=EndpointConfig( + route="/predict", task="text-classification", model="default" + ).dict(), + ) + assert response.status_code == 200 + assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code diff --git a/tests/server/test_loggers.py b/tests/server/test_loggers.py index 8802835381..ce2576c09f 100644 --- a/tests/server/test_loggers.py +++ b/tests/server/test_loggers.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os + import shutil from collections import Counter from unittest import mock @@ -58,247 +58,247 @@ def test_default_logger(): "deepsparse.server.server.server_logger_from_config", return_value=server_logger ), mock_engine(rng_seed=0): app = _build_app(server_config) - # client = TestClient(app) - # - # for _ in range(2): - # client.post("/predict", json={"sequences": "today is great"}) - # assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) + client = TestClient(app) + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) -# def test_data_logging_from_predefined(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name="text_classification", -# model=stub, -# add_predefined=[MetricFunctionConfig(func="text_classification")], -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# client.post( -# "/predict", -# json={ -# "sequences": [["Fun for adults and children.", "Fun for only children."]] -# }, -# ) -# calls = fetch_leaf_logger(server_logger).calls -# data_logging_logs = [call for call in calls if "DATA" in call] -# with open( -# "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 -# "r", -# ) as f: -# expected_logs = f.read().splitlines() -# for log, expected_log in zip(data_logging_logs, expected_logs): -# assert log == expected_log -# -# -# @flaky(max_runs=4, min_passes=3) -# def test_logging_only_system_info(): -# server_config = ServerConfig( -# endpoints=[EndpointConfig(task=task, name=name, model=stub)], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = 
server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# _test_logger_contents( -# fetch_leaf_logger(server_logger), -# {"prediction_latency": 8}, -# ) -# -# -# def test_regex_target_logging(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name=name, -# data_logging={ -# "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] -# }, -# model=stub, -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# _test_logger_contents( -# fetch_leaf_logger(server_logger), -# {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, -# ) -# -# -# def test_multiple_targets_logging(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name=name, -# data_logging={ -# "pipeline_inputs.sequences": [ -# MetricFunctionConfig(func="identity") -# ], -# "engine_inputs": [MetricFunctionConfig(func="identity")], -# }, -# model=stub, -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# _test_logger_contents( -# fetch_leaf_logger(server_logger), -# { -# "pipeline_inputs.sequences__identity": 2, -# "engine_inputs__identity": 2, -# "prediction_latency": 8, -# }, -# ) -# -# -# @flaky(max_runs=3, min_passes=2) -# def test_function_metric_with_target_loggers(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name=name, -# data_logging={ -# "pipeline_inputs.sequences[0]": [ -# MetricFunctionConfig( -# func="identity", target_loggers=["logger_1"] -# ) -# ], -# "engine_inputs": [MetricFunctionConfig(func="identity")], -# }, -# model=stub, -# ) -# ], -# loggers={ -# "logger_1": {"path": logger_identifier}, -# "logger_2": {"path": logger_identifier}, -# }, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# -# _test_logger_contents( -# server_logger.logger.loggers[1].logger.loggers[0], -# { -# "pipeline_inputs.sequences__identity": 2, -# "engine_inputs__identity": 2, -# "prediction_latency": 8, -# }, -# ) -# _test_logger_contents( -# server_logger.logger.loggers[1].logger.loggers[1], -# { -# "pipeline_inputs.sequences__identity": 0, -# "engine_inputs__identity": 2, -# "prediction_latency": 8, -# }, -# ) -# -# -# @mock_engine(rng_seed=0) -# def test_instantiate_prometheus(mock_engine, tmp_path): -# client = TestClient( -# 
_build_app( -# ServerConfig( -# endpoints=[EndpointConfig(task="text_classification", model="default")], -# loggers=dict( -# prometheus={ -# "port": find_free_port(), -# "text_log_save_dir": tmp_path.name, -# "text_log_save_frequency": 30, -# } -# ), -# ) -# ) -# ) -# r = client.post("/predict", json=dict(sequences="asdf")) -# assert r.status_code == 200 -# shutil.rmtree(tmp_path.name, ignore_errors=True) -# -# -# @mock_engine(rng_seed=0) -# def test_endpoint_system_logging(mock_engine): -# server_config = ServerConfig( -# system_logging=ServerSystemLoggingConfig( -# request_details=SystemLoggingGroup(enable=True), -# resource_utilization=SystemLoggingGroup(enable=True), -# ), -# endpoints=[ -# EndpointConfig( -# task="text_classification", -# model="default", -# route="/predict_text_classification", -# logging_config=PipelineSystemLoggingConfig( -# inference_details=SystemLoggingGroup(enable=True), -# prediction_latency=SystemLoggingGroup(enable=True), -# ), -# ), -# EndpointConfig( -# task="question_answering", -# model="default", -# route="/predict_question_answering", -# logging_config=PipelineSystemLoggingConfig( -# inference_details=SystemLoggingGroup(enable=True), -# prediction_latency=SystemLoggingGroup(enable=True), -# ), -# ), -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine: -# app = _build_app(server_config) -# client = TestClient(app) -# client.post("/predict_text_classification", json=dict(sequences="asdf")) -# client.post( -# "/predict_text_classification", json=dict(question="asdf", context="asdf") -# ) -# calls = server_logger.logger.loggers[0].logger.loggers[0].calls -# -# c = Counter([call.split(",")[0] for call in calls]) -# -# assert c == SAMPLE_LOGS_DICT + +def test_data_logging_from_predefined(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name="text_classification", + model=stub, + add_predefined=[MetricFunctionConfig(func="text_classification")], + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + client.post( + "/predict", + json={ + "sequences": [["Fun for adults and children.", "Fun for only children."]] + }, + ) + calls = fetch_leaf_logger(server_logger).calls + data_logging_logs = [call for call in calls if "DATA" in call] + with open( + "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 + "r", + ) as f: + expected_logs = f.read().splitlines() + for log, expected_log in zip(data_logging_logs, expected_logs): + assert log == expected_log + + +@flaky(max_runs=4, min_passes=3) +def test_logging_only_system_info(): + server_config = ServerConfig( + endpoints=[EndpointConfig(task=task, name=name, model=stub)], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + 
_test_logger_contents( + fetch_leaf_logger(server_logger), + {"prediction_latency": 8}, + ) + + +def test_regex_target_logging(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name=name, + data_logging={ + "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] + }, + model=stub, + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + _test_logger_contents( + fetch_leaf_logger(server_logger), + {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, + ) + + +def test_multiple_targets_logging(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name=name, + data_logging={ + "pipeline_inputs.sequences": [ + MetricFunctionConfig(func="identity") + ], + "engine_inputs": [MetricFunctionConfig(func="identity")], + }, + model=stub, + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + _test_logger_contents( + fetch_leaf_logger(server_logger), + { + "pipeline_inputs.sequences__identity": 2, + "engine_inputs__identity": 2, + "prediction_latency": 8, + }, + ) + + +@flaky(max_runs=3, min_passes=2) +def test_function_metric_with_target_loggers(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name=name, + data_logging={ + "pipeline_inputs.sequences[0]": [ + MetricFunctionConfig( + func="identity", target_loggers=["logger_1"] + ) + ], + "engine_inputs": [MetricFunctionConfig(func="identity")], + }, + model=stub, + ) + ], + loggers={ + "logger_1": {"path": logger_identifier}, + "logger_2": {"path": logger_identifier}, + }, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + + _test_logger_contents( + server_logger.logger.loggers[1].logger.loggers[0], + { + "pipeline_inputs.sequences__identity": 2, + "engine_inputs__identity": 2, + "prediction_latency": 8, + }, + ) + _test_logger_contents( + server_logger.logger.loggers[1].logger.loggers[1], + { + "pipeline_inputs.sequences__identity": 0, + "engine_inputs__identity": 2, + "prediction_latency": 8, + }, + ) + + +@mock_engine(rng_seed=0) +def test_instantiate_prometheus(mock_engine, tmp_path): + client = TestClient( + _build_app( + ServerConfig( + endpoints=[EndpointConfig(task="text_classification", model="default")], + loggers=dict( + prometheus={ + "port": find_free_port(), + "text_log_save_dir": tmp_path.name, + "text_log_save_frequency": 30, + } + ), + ) + ) + ) + r = client.post("/predict", json=dict(sequences="asdf")) + assert r.status_code == 200 + shutil.rmtree(tmp_path.name, ignore_errors=True) + + +@mock_engine(rng_seed=0) +def 
test_endpoint_system_logging(mock_engine): + server_config = ServerConfig( + system_logging=ServerSystemLoggingConfig( + request_details=SystemLoggingGroup(enable=True), + resource_utilization=SystemLoggingGroup(enable=True), + ), + endpoints=[ + EndpointConfig( + task="text_classification", + model="default", + route="/predict_text_classification", + logging_config=PipelineSystemLoggingConfig( + inference_details=SystemLoggingGroup(enable=True), + prediction_latency=SystemLoggingGroup(enable=True), + ), + ), + EndpointConfig( + task="question_answering", + model="default", + route="/predict_question_answering", + logging_config=PipelineSystemLoggingConfig( + inference_details=SystemLoggingGroup(enable=True), + prediction_latency=SystemLoggingGroup(enable=True), + ), + ), + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine: + app = _build_app(server_config) + client = TestClient(app) + client.post("/predict_text_classification", json=dict(sequences="asdf")) + client.post( + "/predict_text_classification", json=dict(question="asdf", context="asdf") + ) + calls = server_logger.logger.loggers[0].logger.loggers[0].calls + + c = Counter([call.split(",")[0] for call in calls]) + + assert c == SAMPLE_LOGS_DICT diff --git a/tests/server/test_system_logging.py b/tests/server/test_system_logging.py index bd0a8a3ae3..b6a3fbd2b6 100644 --- a/tests/server/test_system_logging.py +++ b/tests/server/test_system_logging.py @@ -1,169 +1,169 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. 
-# -# from unittest import mock -# -# import pytest -# from deepsparse.loggers.config import SystemLoggingGroup -# from deepsparse.server.config import ( -# EndpointConfig, -# ServerConfig, -# ServerSystemLoggingConfig, -# ) -# from deepsparse.server.helpers import server_logger_from_config -# from deepsparse.server.server import _build_app -# from deepsparse.server.system_logging import log_resource_utilization -# from fastapi.testclient import TestClient -# from tests.deepsparse.loggers.helpers import ListLogger -# from tests.utils import mock_engine -# -# -# logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" -# stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 -# task = "text-classification" -# name = "endpoint_name" -# -# -# def _test_successful_requests(calls, successful_request): -# relevant_call = [ -# call -# for call in calls -# if call.startswith("identifier:request_details/successful_request_count") -# ] -# assert len(relevant_call) == 1 -# relevant_call = relevant_call[0] -# value = bool(int(relevant_call.split("value:")[1].split(",")[0])) -# assert value == successful_request -# -# -# def _test_response_msg(calls, response_msg): -# relevant_call = [ -# call -# for call in calls -# if call.startswith("identifier:request_details/response_message") -# ] -# assert len(relevant_call) == 1 -# relevant_call = relevant_call[0] -# value = relevant_call.split("value:")[1].split(",")[0] -# assert value == response_msg -# -# -# @pytest.mark.parametrize( -# "json_payload, input_batch_size, successful_request, response_msg", -# [ -# ({"sequences": "today is great"}, 1, True, "Response status code: 200"), -# ( -# {"sequences": ["today is great", "today is great"]}, -# 2, -# True, -# "Response status code: 200", -# ), -# ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), -# ], -# ) -# def test_log_request_details( -# json_payload, input_batch_size, successful_request, response_msg -# ): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, name=name, model=stub, batch_size=input_batch_size -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# system_logging=ServerSystemLoggingConfig( -# request_details=SystemLoggingGroup(enable=True) -# ), -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# client.post("/predict", json=json_payload) -# -# calls = server_logger.logger.loggers[0].logger.loggers[0].calls -# -# _test_successful_requests(calls, successful_request) -# _test_response_msg(calls, response_msg) -# -# -# def _test_cpu_utilization(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith("identifier:resource_utilization/cpu_utilization_percent") -# ] -# assert len(relevant_calls) == num_iterations -# -# -# def _test_memory_utilization(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith("identifier:resource_utilization/memory_utilization_percent") -# ] -# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] -# assert len(relevant_calls) == num_iterations -# # memory utilization is a percentage, so it should be between 0 and 100 -# assert all(0.0 < value < 100.0 for value in values) -# -# -# def 
_test_total_memory_available(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith( -# "identifier:resource_utilization/total_memory_available_bytes" -# ) -# ] -# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] -# assert len(relevant_calls) == num_iterations -# # assert all values are the same (total memory available is constant) -# assert all(value == values[0] for value in values) -# -# -# def _test_additional_items_to_log(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith("identifier:resource_utilization/test") -# ] -# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] -# assert len(relevant_calls) == num_iterations -# # assert all values are the same ({"test" : 1} is constant) -# assert all(value == 1 for value in values) -# -# -# @pytest.mark.parametrize( -# "num_iterations, additional_items_to_log", -# [ -# (5, {}), -# (3, {"test": 1}), -# ], -# ) -# def test_log_resource_utilization(num_iterations, additional_items_to_log): -# server_logger = ListLogger() -# -# for iter in range(num_iterations): -# log_resource_utilization( -# server_logger, prefix="resource_utilization", **additional_items_to_log -# ) -# -# calls = server_logger.calls -# -# _test_cpu_utilization(calls, num_iterations) -# _test_memory_utilization(calls, num_iterations) -# _test_total_memory_available(calls, num_iterations) +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from unittest import mock + +import pytest +from deepsparse.loggers.config import SystemLoggingGroup +from deepsparse.server.config import ( + EndpointConfig, + ServerConfig, + ServerSystemLoggingConfig, +) +from deepsparse.server.helpers import server_logger_from_config +from deepsparse.server.server import _build_app +from deepsparse.server.system_logging import log_resource_utilization +from fastapi.testclient import TestClient +from tests.deepsparse.loggers.helpers import ListLogger +from tests.utils import mock_engine + + +logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" +stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 +task = "text-classification" +name = "endpoint_name" + + +def _test_successful_requests(calls, successful_request): + relevant_call = [ + call + for call in calls + if call.startswith("identifier:request_details/successful_request_count") + ] + assert len(relevant_call) == 1 + relevant_call = relevant_call[0] + value = bool(int(relevant_call.split("value:")[1].split(",")[0])) + assert value == successful_request + + +def _test_response_msg(calls, response_msg): + relevant_call = [ + call + for call in calls + if call.startswith("identifier:request_details/response_message") + ] + assert len(relevant_call) == 1 + relevant_call = relevant_call[0] + value = relevant_call.split("value:")[1].split(",")[0] + assert value == response_msg + + +@pytest.mark.parametrize( + "json_payload, input_batch_size, successful_request, response_msg", + [ + ({"sequences": "today is great"}, 1, True, "Response status code: 200"), + ( + {"sequences": ["today is great", "today is great"]}, + 2, + True, + "Response status code: 200", + ), + ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), + ], +) +def test_log_request_details( + json_payload, input_batch_size, successful_request, response_msg +): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, name=name, model=stub, batch_size=input_batch_size + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + system_logging=ServerSystemLoggingConfig( + request_details=SystemLoggingGroup(enable=True) + ), + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + client.post("/predict", json=json_payload) + + calls = server_logger.logger.loggers[0].logger.loggers[0].calls + + _test_successful_requests(calls, successful_request) + _test_response_msg(calls, response_msg) + + +def _test_cpu_utilization(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith("identifier:resource_utilization/cpu_utilization_percent") + ] + assert len(relevant_calls) == num_iterations + + +def _test_memory_utilization(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith("identifier:resource_utilization/memory_utilization_percent") + ] + values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] + assert len(relevant_calls) == num_iterations + # memory utilization is a percentage, so it should be between 0 and 100 + assert all(0.0 < value < 100.0 for value in values) + + +def _test_total_memory_available(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith( + 
"identifier:resource_utilization/total_memory_available_bytes" + ) + ] + values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] + assert len(relevant_calls) == num_iterations + # assert all values are the same (total memory available is constant) + assert all(value == values[0] for value in values) + + +def _test_additional_items_to_log(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith("identifier:resource_utilization/test") + ] + values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] + assert len(relevant_calls) == num_iterations + # assert all values are the same ({"test" : 1} is constant) + assert all(value == 1 for value in values) + + +@pytest.mark.parametrize( + "num_iterations, additional_items_to_log", + [ + (5, {}), + (3, {"test": 1}), + ], +) +def test_log_resource_utilization(num_iterations, additional_items_to_log): + server_logger = ListLogger() + + for iter in range(num_iterations): + log_resource_utilization( + server_logger, prefix="resource_utilization", **additional_items_to_log + ) + + calls = server_logger.calls + + _test_cpu_utilization(calls, num_iterations) + _test_memory_utilization(calls, num_iterations) + _test_total_memory_available(calls, num_iterations) From b1cf01be8b0a8a8d25aab09acab3b9d0b9dc2767 Mon Sep 17 00:00:00 2001 From: dbogunowicz Date: Thu, 2 Mar 2023 11:35:52 +0000 Subject: [PATCH 05/68] initial commit --- src/deepsparse/transformers/pipelines/text_generation.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/deepsparse/transformers/pipelines/text_generation.py diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py new file mode 100644 index 0000000000..e69de29bb2 From 0a3f48dad8fe1dc30dd80f03d9753ed2b1ed7d38 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 16 Mar 2023 14:06:04 +0100 Subject: [PATCH 06/68] [Codegen][ORT][Static Seq Length] TextGenerationPipeline (#946) * initial commit * coreys simplifications * finishing the second model static * ready, time for beautification * ready for review * moved the code to examples * fix eos logic * add argument num_tokens_to_generate --- examples/codegen/README.md | 30 ++ examples/codegen/text_generation.py | 494 ++++++++++++++++++ .../transformers/pipelines/pipeline.py | 14 +- .../transformers/pipelines/text_generation.py | 0 4 files changed, 533 insertions(+), 5 deletions(-) create mode 100644 examples/codegen/README.md create mode 100644 examples/codegen/text_generation.py delete mode 100644 src/deepsparse/transformers/pipelines/text_generation.py diff --git a/examples/codegen/README.md b/examples/codegen/README.md new file mode 100644 index 0000000000..d855a5e075 --- /dev/null +++ b/examples/codegen/README.md @@ -0,0 +1,30 @@ + + +Example of how to run the pipeline: + +```python +from examples.codegen.text_generation import TextGenerationPipeline + +codegen = TextGenerationPipeline( + model_path="/network/damian/static-codegen-350M-multi", + engine_type="onnxruntime", + sequence_length=128, ) + +out = codegen(sequences=["def hello_world():", "def fibonacci(x):"]) +for seq in out.sequences: + print(seq) +``` \ No newline at end of file diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py new file mode 100644 index 0000000000..1812c9ef93 --- /dev/null +++ b/examples/codegen/text_generation.py @@ -0,0 +1,494 @@ +# 
Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from tempfile import NamedTemporaryFile +from typing import Dict, List, Optional, Tuple, Type, Union + +import numpy +import onnx +from pydantic import BaseModel, Field + +from deepsparse import Context, MultiModelEngine, Pipeline +from deepsparse.pipeline import ( + DEEPSPARSE_ENGINE, + ORT_ENGINE, + SUPPORTED_PIPELINE_ENGINES, + Engine, + ORTEngine, +) +from deepsparse.transformers.pipelines import TransformersPipeline +from scipy.special import softmax + + +_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" +_MODEL_DIR_ONNX_NAME = "model.onnx" + +__all__ = ["TextGenerationPipeline"] + + +def overwrite_transformer_onnx_model_inputs( + path: str, + batch_size: int = 1, + max_length: int = 128, + output_path: Optional[str] = None, +) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: + """ + Overrides an ONNX model's inputs to have the given batch size and sequence lengths. + Assumes that these are the first and second shape indices of the given model inputs + respectively + + :param path: path to the ONNX model to override + :param batch_size: batch size to set + :param max_length: max sequence length to set + :param output_path: if provided, the model will be saved to the given path, + otherwise, the model will be saved to a named temporary file that will + be deleted after the program exits + :return: if no output path, a tuple of the saved path to the model, list of + model input names, and reference to the tempfile object will be returned + otherwise, only the model input names will be returned + """ + # overwrite input shapes + model = onnx.load(path) + initializer_input_names = set([node.name for node in model.graph.initializer]) + external_inputs = [ + inp for inp in model.graph.input if inp.name not in initializer_input_names + ] + input_names = [] + for external_input in external_inputs: + # this is removed for now (will need to be accounted for when we start + # supporting deepsparse engine + # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + # external_input.type.tensor_type.shape.dim[1].dim_value = max_length + input_names.append(external_input.name) + + # Save modified model + if output_path is None: + tmp_file = NamedTemporaryFile() # file will be deleted after program exit + onnx.save(model, tmp_file.name) + + return tmp_file.name, input_names, tmp_file + else: + onnx.save(model, output_path) + return input_names + + +class TextGenerationInput(BaseModel): + sequences: Union[str, List[str]] = Field( + description="The input sequence(s) to generate " + "text from. If a string is provided, " + "the model will generate text from the " + "provided sequence. 
If a list of strings " + "is provided, the model will " + "generate text from each sequence in the list.", + ) + + +class TextGenerationOutput(BaseModel): + sequences: Union[str, List[str]] = Field( + description="The input text sequence(s) appended with " + "the generated text sequence(s). " + "If a string was provided as input, " + "a string will be returned. " + "If a list of strings was provided as " + "input, a list of strings will be returned.", + ) + + +@Pipeline.register( + task="text_generation", + task_aliases=["codegen"], +) +class TextGenerationPipeline(TransformersPipeline): + """ + Pipeline for text generation tasks. + + :param deterministic: if True, the pipeline will sample from + the probability distribution computed from the logits. + If False, the pipeline will get the next token by applying + an argmax function to the logits. + :param sampling_temperature: the temperature to use when sampling + from the probability distribution computed from the logits. + Higher values will result in more random samples. + :param num_tokens_to_generate: the number of tokens to generate + given the input sequence. If None, the model will generate + tokens until the end of the sequence is reached. + :param kwargs: kwargs to pass to the TransformersPipeline + """ + + def __init__( + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + num_tokens_to_generate: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.deterministic = deterministic + self.sampling_temperature = sampling_temperature + self.num_tokens_to_generate = num_tokens_to_generate + + # set-up the auxiliary multitoken model + self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path() + # initialize the auxiliary multitoken engine + self.multitoken_engine = self._initialize_multitoken_engine() + + # re-initialize the target model + # this will be removed once codegen is productionized + self.onnx_path = self._setup_onnx_file_path() + self.engine = self._reinitialize_engine() + + if self._batch_size != 1: + raise ValueError( + "For the sake of simplicity, only dynamic" + "batch shape is supported for now. " + "Set `batch_size` to 1 or None." + ) + + @staticmethod + def route_input_to_bucket( + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + ) -> Pipeline: + """ + This method is used to route the input to the correct pipeline. + + :param args: args to pass to the pipeline + :param input_schema: the input schema for the pipeline + :param pipelines: the list of pipelines to route the input to + :param kwargs: kwargs to pass to the pipeline + :return: the pipeline to route the input to + """ + raise NotImplementedError + + @property + def input_schema(self) -> Type[BaseModel]: + """ + Property to return the input schema for the pipeline. + + :return: the input schema for the pipeline + """ + return TextGenerationInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + Property to return the output schema for the pipeline. + + :return: the output schema for the pipeline + """ + return TextGenerationOutput + + def process_engine_outputs( + self, engine_outputs: List[numpy.ndarray], **kwargs + ) -> BaseModel: + """ + Convert the engine outputs to the output schema for the pipeline. 
+ + :param engine_outputs: the outputs from the engine + :return: the output schema for the pipeline + """ + sequences = self.tokenizer.batch_decode( + engine_outputs[0], skip_special_tokens=True + ) + return TextGenerationOutput(sequences=sequences) + + def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]: + """ + Convert the input schema for the pipeline to the inputs for the engine. + + :param inputs: the input schema for the pipeline + :return: the inputs for the engine + """ + sequences = inputs.sequences + + if isinstance(sequences, List) and all( + isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences + ): + # if batch items contain only one sequence but are wrapped in lists, unwrap + # for use as tokenizer input + sequences = [sequence[0] for sequence in sequences] + + self.tokenizer.pad_token = self.tokenizer.eos_token + + input_tokens = self.tokenizer( + sequences, + return_tensors="np", + max_length=self.sequence_length, + padding="max_length", + ) + + engine_input = self.tokens_to_engine_input( + input_tokens, onnx_input_names=self.multitoken_engine._input_names + ) + + return engine_input + + def engine_forward( + self, engine_inputs: List[numpy.ndarray], **kwargs + ) -> numpy.ndarray: + """ + :param engine_inputs: list of numpy inputs to + Pipeline engine forward pass + :return: A numpy array that contains the tokens generated by the model + """ + + # list of the meaningful tokens in the sequence + tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] + + tokens, kv_cache = self.initial_autoregressive_pass( + tokens=tokens, engine_inputs=engine_inputs + ) + + # establish the number of autoregressive passes to perform + num_iterations = self.sequence_length - len(tokens) + if self.num_tokens_to_generate: + if self.num_tokens_to_generate > num_iterations: + raise ValueError( + f"Num_tokens_to_generate ({self.num_tokens_to_generate}) " + f"cannot be greater than sequence_length ({self.sequence_length}) " + f"minus the number of tokens in the input sequence ({len(tokens)})." + ) + num_iterations = self.num_tokens_to_generate + + # perform the remaining autoregressive passes + for iter in range(num_iterations): + eos_token_found = self.tokenizer.eos_token_id == tokens[-1] + if eos_token_found: + # fill the token list so that it has the correct sequence length + tokens = tokens + [self.tokenizer.pad_token_id] * ( + self.sequence_length - len(tokens) + ) + return numpy.array([[tokens]]) + + tokens, kv_cache = self.autoregressive_pass( + tokens=tokens, + kv_cache=kv_cache, + ) + + # fill the token list so that it has the correct sequence length + tokens = tokens + [self.tokenizer.pad_token_id] * ( + self.sequence_length - len(tokens) + ) + return numpy.array([[tokens]]) + + def autoregressive_pass( + self, + tokens: List[int], + kv_cache: Dict[str, numpy.ndarray], + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + Performs an autoregressive pass to generate the next token in the sequence + and update the kv_cache with the new key/value pairs. + + 1) Set the attention mask to 1 for the tokens that are already in the sequence + and 1 for the `new_token` - at the last position. This is because the + `new_token`'s key/value will be added to the set of keys/values + at the last position (before being fed to an attention block) + 2) Set up the engine inputs + 3) Run the engine forward pas + 4) Preprocesses the kv cache so that it can be used as input to the next + autoregressive pass. 
+ 5) Returns the new token sequence and the updated kv cache. + + :param tokens: the current token sequence + :param kv_cache: the current kv_cache + :return: the new token sequence and the updated kv cache + """ + + new_token = tokens[-1] + + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + attention_mask[:, : len(tokens)] = 1 + attention_mask[:, -1] = 1 + + engine_inputs_dict = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + } + engine_inputs_dict.update(kv_cache) + + engine_inputs = [engine_inputs_dict[name] for name in self.engine._input_names] + + new_logits, *new_kvs = self.engine(engine_inputs) + + # rename the output names to match the names expected + # in the next autoregressive pass + kv_output_names = [ + name.replace("present", "past_key_values") + for name in self.engine._output_names + if name.startswith("present") + ] + kv_cache = dict(zip(kv_output_names, new_kvs)) + for k, v in kv_cache.items(): + v[:, :, len(tokens) - 1] = v[:, :, -1] + kv_cache[k] = numpy.ascontiguousarray(v[:, :, :-1]) + + # Obtain the next token from the logits + new_token = TextGenerationPipeline.sample_new_token( + logits=new_logits[0, -1, :], + deterministic=self.deterministic, + temperature=self.sampling_temperature, + ) + tokens.append(new_token) + + return tokens, kv_cache + + def initial_autoregressive_pass( + self, + tokens: List[int], + engine_inputs: List[numpy.ndarray], + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + Performs a single autoregressive pass to initialize the key, value cache. + + 1) Obtains logits and kv cache for the input sequence. + From logits, obtains the next token. + 2) Preprocesses the kv cache so that it can be used as input to the next + autoregressive pass. + 3) Returns the new token sequence and the updated kv cache. + + :param tokens: input tokens provided by the user + :param engine_inputs: list of numpy inputs to Pipeline + engine forward pass + :return: the extended token sequence and the kv cache + """ + + past_logits, *new_kvs = self.multitoken_engine(engine_inputs) + + # rename the output names to match the names expected + # in the next autoregressive pass + kv_output_names = [ + name.replace("present", "past_key_values") + for name in self.multitoken_engine._output_names + if name.startswith("present") + ] + kv_cache = dict(zip(kv_output_names, new_kvs)) + for k, v in kv_cache.items(): + # remove the information about the `new_token` from the cache + v = v[:, :, :-1] + # zero out all the info that does not pertain to the + # "seen" `token` sequence + v[:, :, len(tokens) :] = 0.0 + kv_cache[k] = numpy.ascontiguousarray(v) + + # Obtain the next token from the logits + new_token = TextGenerationPipeline.sample_new_token( + logits=past_logits[0, len(tokens) - 1], + deterministic=self.deterministic, + temperature=self.sampling_temperature, + ) + tokens.append(new_token) + + return tokens, kv_cache + + @staticmethod + def sample_new_token( + logits: numpy.ndarray, deterministic: bool, temperature: float + ) -> int: + """ + Samples a token from the logits using the sampling temperature. 
+ + :param logits: the logits from the model + :param deterministic: whether to sample from the softmax or take the argmax + :param temperature: the sampling temperature + + :return: the sampled token + """ + if deterministic: + return numpy.argmax(logits) + else: + logits /= temperature + probs = softmax(logits) + return numpy.random.choice(len(probs), p=probs) + + def _setup_multitoken_onnx_file_path(self) -> str: + # `setup_onnx_file_path` function rewritten + # to setup the multitoken_onnx_file_path + + multitoken_onnx_path = os.path.join( + self.model_path, _MODEL_DIR_ONNX_MULTI_TOKEN_NAME + ) + ( + multitoken_onnx_path, + self.multitoken_onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + multitoken_onnx_path, max_length=self.sequence_length + ) + + return multitoken_onnx_path + + def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]: + # `_initialize_engine` function rewritten + # to initialize the multitoken_engine + + engine_type = self.engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.context is not None and isinstance(self.context, Context): + self._engine_args.pop("num_cores", None) + self._engine_args.pop("scheduler", None) + self._engine_args["context"] = self.context + return MultiModelEngine( + model=self.onnx_multitoken_path, + **self._engine_args, + ) + return Engine(self.onnx_multitoken_path, **self._engine_args) + elif engine_type == ORT_ENGINE: + return ORTEngine(self.onnx_multitoken_path, **self._engine_args) + else: + raise ValueError( + f"Unknown engine_type {self.engine_type}. Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) + + def _setup_onnx_file_path(self) -> str: + # `setup_onnx_file_path` function rewritten + + onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, max_length=self.sequence_length + ) + + return onnx_path + + def _initialize_engine(self): + return None + + def _reinitialize_engine(self) -> Union[Engine, ORTEngine]: + # `_initialize_engine` function rewritten + + engine_type = self.engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.context is not None and isinstance(self.context, Context): + self._engine_args.pop("num_cores", None) + self._engine_args.pop("scheduler", None) + self._engine_args["context"] = self.context + return MultiModelEngine( + model=self.onnx_path, + **self._engine_args, + ) + return Engine(self.onnx_path, **self._engine_args) + elif engine_type == ORT_ENGINE: + return ORTEngine(self.onnx_path, **self._engine_args) + else: + raise ValueError( + f"Unknown engine_type {self.engine_type}. 
Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py index 38073e260f..16603b6950 100644 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -109,7 +109,8 @@ def setup_onnx_file_path(self) -> str: config_path, finetuning_task=self.task if hasattr(self, "task") else None ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_path, model_max_length=self.sequence_length + tokenizer_path, + model_max_length=self.sequence_length, ) self.config_path = os.path.join(config_path, "config.json") self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json") @@ -126,19 +127,22 @@ def setup_onnx_file_path(self) -> str: return onnx_path def tokens_to_engine_input( - self, tokens: Mapping[Any, numpy.ndarray] + self, + tokens: Mapping[Any, numpy.ndarray], + onnx_input_names: Optional[List[str]] = None, ) -> List[numpy.ndarray]: """ :param tokens: outputs of the pipeline tokenizer :return: list of numpy arrays in expected order for model input """ - if not all(name in tokens for name in self.onnx_input_names): + onnx_input_names = onnx_input_names or self.onnx_input_names + if not all(name in tokens for name in onnx_input_names): raise ValueError( - f"pipeline expected arrays with names {self.onnx_input_names}, " + f"pipeline expected arrays with names {onnx_input_names}, " f"received inputs: {list(tokens.keys())}" ) - return [tokens[name] for name in self.onnx_input_names] + return [tokens[name] for name in onnx_input_names] @staticmethod def should_bucket(*args, **kwargs) -> bool: diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py deleted file mode 100644 index e69de29bb2..0000000000 From add4625a2a0ba99bf3e87c5b37c04153cf71c53e Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 23 Mar 2023 15:40:08 +0100 Subject: [PATCH 07/68] [CodeGen][Documentation] (#956) * initial commit * coreys simplifications * finishing the second model static * ready, time for beautification * ready for review * moved the code to examples * fix eos logic * add argument num_tokens_to_generate * initial commit * change order * Update examples/codegen/README.md Co-authored-by: corey-nm <109536191+corey-nm@users.noreply.github.com> --------- Co-authored-by: corey-nm <109536191+corey-nm@users.noreply.github.com> --- examples/codegen/README.md | 81 +++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/examples/codegen/README.md b/examples/codegen/README.md index d855a5e075..1e47c01f7b 100644 --- a/examples/codegen/README.md +++ b/examples/codegen/README.md @@ -14,7 +14,43 @@ See the License for the specific language governing permissions and limitations under the License. 
--> -Example of how to run the pipeline: +## ONNX Export +First, we need to install the HuggingFace Optimum library: +```bash +pip install optimum +``` + +### Patch the original PyTorch Model +First, apply the following modification to this file in your transformers installation: +https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py#L212 + +```diff +-offset = layer_past[0].shape[-2] ++offset = (attention_mask[0] == 0.0).sum() - 1.0 +``` + +We need to do this because the existing with_past implementations assume there is no padding in the inputs. With DeepSparse, we need to use a static sequence length, which means our offset for the embeddings will depend on how many non-padded inputs we receive. + +The new line checks this with the attention_mask. At this point in the code, attention_mask has been transformed from a tensor of 0s and 1s to a tensor of `float.min` and `0.0`. So when we compare `attention_mask == 0.0` we are effectively counting the positions where the original attention_mask was 1. + +We also need to subtract 1 from this count, because the attention mask is applied AFTER the kv cache is concatenated to the new token, which means the attention mask will actually cover sequence length + 1 items. So we subtract 1 to get the current sequence length. + +### Export the model to ONNX + +```bash +optimum-cli export onnx --model Salesforce/codegen-350M-multi codegen-350M-multi +``` +This saves the model to the directory `codegen-350M-multi`. + +### Updating the Model's Inputs/Outputs Dimension Sizes +TODO + +## Running in the DeepSparse Pipeline + +First, we need to rename `decoder_with_past_model.onnx` to `model.onnx` inside +the `static-codegen-350M-multi` directory, to abide by the naming convention. + +Finally, run the pipeline: ```python from examples.codegen.text_generation import TextGenerationPipeline codegen = TextGenerationPipeline( model_path="/network/damian/static-codegen-350M-multi", engine_type="onnxruntime", - sequence_length=128, ) + sequence_length=128) + +out = codegen(sequences="def hello_world():") +print(out.sequences[0]) +``` + +```bash +def hello_world(): + return 'Hello World!' + +def hello_world_2(): + return 'Hello World!' + +def hello_world_3(): + return 'Hello World!' + +def hello_world_4(): + return 'Hello World!' + +def hello_world_5(): + return 'Hello World!' + +def hello_world_6(): + return 'Hello World!' + +def hello_world_7(): + return 'Hello World!' + +def hello_world_8(): + return 'Hello World!' + +def hello +``` -out = codegen(sequences=["def hello_world():", "def fibonacci(x):"]) -for seq in out.sequences: - print(seq) -``` \ No newline at end of file +Modifying pipeline behaviour: +1. By adding the argument `deterministic=False`, the next token of the sequence will not be chosen deterministically (using argmax), but will be +sampled from the probability distribution. +2. By setting `sampling_temperature` when `deterministic=False`, we allow more or less randomness in the sampling method, as sketched below (see https://towardsdatascience.com/how-to-sample-from-language-models-682bceb97277). +3. By setting `num_tokens_to_generate`, we specify exactly how many tokens we want to generate per input.
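The sampling behaviour described in the list above can be summarized with a small standalone sketch. It mirrors the `sample_new_token` helper added earlier in this patch series; the function name used here is illustrative only and not part of the DeepSparse API.

```python
# Illustrative sketch (not part of the library) of the deterministic /
# temperature-sampling behaviour described above.
import numpy
from scipy.special import softmax


def pick_next_token(
    logits: numpy.ndarray, deterministic: bool = True, temperature: float = 1.0
) -> int:
    if deterministic:
        # always take the most likely token (argmax)
        return int(numpy.argmax(logits))
    # scale the logits by the temperature and sample from the resulting
    # softmax distribution; higher temperatures increase randomness
    probs = softmax(logits / temperature)
    return int(numpy.random.choice(len(probs), p=probs))
```

Lower temperatures concentrate probability mass on the highest-scoring tokens, so sampled generations stay close to the deterministic argmax path, while higher temperatures flatten the distribution and produce more varied output.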
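For reference, the attention-mask offset introduced by the `modeling_codegen.py` patch above can also be exercised in isolation. The sketch below assumes PyTorch tensors and the additive mask convention described in the README (`0.0` for attendable positions, `float.min` for padding); the helper name is illustrative only.

```python
# Minimal sketch of the patched offset computation, assuming an additive
# attention mask (0.0 where the original mask was 1, float.min where it was 0).
import torch


def kv_cache_offset(additive_attention_mask: torch.Tensor) -> torch.Tensor:
    # count the attendable positions, then subtract 1 because the mask already
    # covers the newly appended token
    return (additive_attention_mask[0] == 0.0).sum() - 1.0


# three "real" tokens plus the new token are attendable in a padded length of 6
mask = torch.tensor([[0.0] * 4 + [torch.finfo(torch.float32).min] * 2])
assert kv_cache_offset(mask) == 3.0
```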
From 22d27465b7bcb576c8cb383a0d4b34b720fa32e9 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Mon, 8 May 2023 10:47:11 -0400 Subject: [PATCH 08/68] reimplementation for generative pipelines --- examples/codegen/text_generation.py | 494 -------------------- src/deepsparse/pipeline.py | 52 ++- src/deepsparse/pipelines/text_generation.py | 316 +++++++++++++ 3 files changed, 348 insertions(+), 514 deletions(-) delete mode 100644 examples/codegen/text_generation.py create mode 100644 src/deepsparse/pipelines/text_generation.py diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py deleted file mode 100644 index 1812c9ef93..0000000000 --- a/examples/codegen/text_generation.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from tempfile import NamedTemporaryFile -from typing import Dict, List, Optional, Tuple, Type, Union - -import numpy -import onnx -from pydantic import BaseModel, Field - -from deepsparse import Context, MultiModelEngine, Pipeline -from deepsparse.pipeline import ( - DEEPSPARSE_ENGINE, - ORT_ENGINE, - SUPPORTED_PIPELINE_ENGINES, - Engine, - ORTEngine, -) -from deepsparse.transformers.pipelines import TransformersPipeline -from scipy.special import softmax - - -_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" -_MODEL_DIR_ONNX_NAME = "model.onnx" - -__all__ = ["TextGenerationPipeline"] - - -def overwrite_transformer_onnx_model_inputs( - path: str, - batch_size: int = 1, - max_length: int = 128, - output_path: Optional[str] = None, -) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: - """ - Overrides an ONNX model's inputs to have the given batch size and sequence lengths. 
- Assumes that these are the first and second shape indices of the given model inputs - respectively - - :param path: path to the ONNX model to override - :param batch_size: batch size to set - :param max_length: max sequence length to set - :param output_path: if provided, the model will be saved to the given path, - otherwise, the model will be saved to a named temporary file that will - be deleted after the program exits - :return: if no output path, a tuple of the saved path to the model, list of - model input names, and reference to the tempfile object will be returned - otherwise, only the model input names will be returned - """ - # overwrite input shapes - model = onnx.load(path) - initializer_input_names = set([node.name for node in model.graph.initializer]) - external_inputs = [ - inp for inp in model.graph.input if inp.name not in initializer_input_names - ] - input_names = [] - for external_input in external_inputs: - # this is removed for now (will need to be accounted for when we start - # supporting deepsparse engine - # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # external_input.type.tensor_type.shape.dim[1].dim_value = max_length - input_names.append(external_input.name) - - # Save modified model - if output_path is None: - tmp_file = NamedTemporaryFile() # file will be deleted after program exit - onnx.save(model, tmp_file.name) - - return tmp_file.name, input_names, tmp_file - else: - onnx.save(model, output_path) - return input_names - - -class TextGenerationInput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The input sequence(s) to generate " - "text from. If a string is provided, " - "the model will generate text from the " - "provided sequence. If a list of strings " - "is provided, the model will " - "generate text from each sequence in the list.", - ) - - -class TextGenerationOutput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The input text sequence(s) appended with " - "the generated text sequence(s). " - "If a string was provided as input, " - "a string will be returned. " - "If a list of strings was provided as " - "input, a list of strings will be returned.", - ) - - -@Pipeline.register( - task="text_generation", - task_aliases=["codegen"], -) -class TextGenerationPipeline(TransformersPipeline): - """ - Pipeline for text generation tasks. - - :param deterministic: if True, the pipeline will sample from - the probability distribution computed from the logits. - If False, the pipeline will get the next token by applying - an argmax function to the logits. - :param sampling_temperature: the temperature to use when sampling - from the probability distribution computed from the logits. - Higher values will result in more random samples. - :param num_tokens_to_generate: the number of tokens to generate - given the input sequence. If None, the model will generate - tokens until the end of the sequence is reached. 
- :param kwargs: kwargs to pass to the TransformersPipeline - """ - - def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - num_tokens_to_generate: Optional[int] = None, - **kwargs, - ): - super().__init__(**kwargs) - self.deterministic = deterministic - self.sampling_temperature = sampling_temperature - self.num_tokens_to_generate = num_tokens_to_generate - - # set-up the auxiliary multitoken model - self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path() - # initialize the auxiliary multitoken engine - self.multitoken_engine = self._initialize_multitoken_engine() - - # re-initialize the target model - # this will be removed once codegen is productionized - self.onnx_path = self._setup_onnx_file_path() - self.engine = self._reinitialize_engine() - - if self._batch_size != 1: - raise ValueError( - "For the sake of simplicity, only dynamic" - "batch shape is supported for now. " - "Set `batch_size` to 1 or None." - ) - - @staticmethod - def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs - ) -> Pipeline: - """ - This method is used to route the input to the correct pipeline. - - :param args: args to pass to the pipeline - :param input_schema: the input schema for the pipeline - :param pipelines: the list of pipelines to route the input to - :param kwargs: kwargs to pass to the pipeline - :return: the pipeline to route the input to - """ - raise NotImplementedError - - @property - def input_schema(self) -> Type[BaseModel]: - """ - Property to return the input schema for the pipeline. - - :return: the input schema for the pipeline - """ - return TextGenerationInput - - @property - def output_schema(self) -> Type[BaseModel]: - """ - Property to return the output schema for the pipeline. - - :return: the output schema for the pipeline - """ - return TextGenerationOutput - - def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs - ) -> BaseModel: - """ - Convert the engine outputs to the output schema for the pipeline. - - :param engine_outputs: the outputs from the engine - :return: the output schema for the pipeline - """ - sequences = self.tokenizer.batch_decode( - engine_outputs[0], skip_special_tokens=True - ) - return TextGenerationOutput(sequences=sequences) - - def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]: - """ - Convert the input schema for the pipeline to the inputs for the engine. 
- - :param inputs: the input schema for the pipeline - :return: the inputs for the engine - """ - sequences = inputs.sequences - - if isinstance(sequences, List) and all( - isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences - ): - # if batch items contain only one sequence but are wrapped in lists, unwrap - # for use as tokenizer input - sequences = [sequence[0] for sequence in sequences] - - self.tokenizer.pad_token = self.tokenizer.eos_token - - input_tokens = self.tokenizer( - sequences, - return_tensors="np", - max_length=self.sequence_length, - padding="max_length", - ) - - engine_input = self.tokens_to_engine_input( - input_tokens, onnx_input_names=self.multitoken_engine._input_names - ) - - return engine_input - - def engine_forward( - self, engine_inputs: List[numpy.ndarray], **kwargs - ) -> numpy.ndarray: - """ - :param engine_inputs: list of numpy inputs to - Pipeline engine forward pass - :return: A numpy array that contains the tokens generated by the model - """ - - # list of the meaningful tokens in the sequence - tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] - - tokens, kv_cache = self.initial_autoregressive_pass( - tokens=tokens, engine_inputs=engine_inputs - ) - - # establish the number of autoregressive passes to perform - num_iterations = self.sequence_length - len(tokens) - if self.num_tokens_to_generate: - if self.num_tokens_to_generate > num_iterations: - raise ValueError( - f"Num_tokens_to_generate ({self.num_tokens_to_generate}) " - f"cannot be greater than sequence_length ({self.sequence_length}) " - f"minus the number of tokens in the input sequence ({len(tokens)})." - ) - num_iterations = self.num_tokens_to_generate - - # perform the remaining autoregressive passes - for iter in range(num_iterations): - eos_token_found = self.tokenizer.eos_token_id == tokens[-1] - if eos_token_found: - # fill the token list so that it has the correct sequence length - tokens = tokens + [self.tokenizer.pad_token_id] * ( - self.sequence_length - len(tokens) - ) - return numpy.array([[tokens]]) - - tokens, kv_cache = self.autoregressive_pass( - tokens=tokens, - kv_cache=kv_cache, - ) - - # fill the token list so that it has the correct sequence length - tokens = tokens + [self.tokenizer.pad_token_id] * ( - self.sequence_length - len(tokens) - ) - return numpy.array([[tokens]]) - - def autoregressive_pass( - self, - tokens: List[int], - kv_cache: Dict[str, numpy.ndarray], - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: - """ - Performs an autoregressive pass to generate the next token in the sequence - and update the kv_cache with the new key/value pairs. - - 1) Set the attention mask to 1 for the tokens that are already in the sequence - and 1 for the `new_token` - at the last position. This is because the - `new_token`'s key/value will be added to the set of keys/values - at the last position (before being fed to an attention block) - 2) Set up the engine inputs - 3) Run the engine forward pas - 4) Preprocesses the kv cache so that it can be used as input to the next - autoregressive pass. - 5) Returns the new token sequence and the updated kv cache. 
- - :param tokens: the current token sequence - :param kv_cache: the current kv_cache - :return: the new token sequence and the updated kv cache - """ - - new_token = tokens[-1] - - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, : len(tokens)] = 1 - attention_mask[:, -1] = 1 - - engine_inputs_dict = { - "input_ids": numpy.array([[new_token]]), - "attention_mask": attention_mask, - } - engine_inputs_dict.update(kv_cache) - - engine_inputs = [engine_inputs_dict[name] for name in self.engine._input_names] - - new_logits, *new_kvs = self.engine(engine_inputs) - - # rename the output names to match the names expected - # in the next autoregressive pass - kv_output_names = [ - name.replace("present", "past_key_values") - for name in self.engine._output_names - if name.startswith("present") - ] - kv_cache = dict(zip(kv_output_names, new_kvs)) - for k, v in kv_cache.items(): - v[:, :, len(tokens) - 1] = v[:, :, -1] - kv_cache[k] = numpy.ascontiguousarray(v[:, :, :-1]) - - # Obtain the next token from the logits - new_token = TextGenerationPipeline.sample_new_token( - logits=new_logits[0, -1, :], - deterministic=self.deterministic, - temperature=self.sampling_temperature, - ) - tokens.append(new_token) - - return tokens, kv_cache - - def initial_autoregressive_pass( - self, - tokens: List[int], - engine_inputs: List[numpy.ndarray], - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: - """ - Performs a single autoregressive pass to initialize the key, value cache. - - 1) Obtains logits and kv cache for the input sequence. - From logits, obtains the next token. - 2) Preprocesses the kv cache so that it can be used as input to the next - autoregressive pass. - 3) Returns the new token sequence and the updated kv cache. - - :param tokens: input tokens provided by the user - :param engine_inputs: list of numpy inputs to Pipeline - engine forward pass - :return: the extended token sequence and the kv cache - """ - - past_logits, *new_kvs = self.multitoken_engine(engine_inputs) - - # rename the output names to match the names expected - # in the next autoregressive pass - kv_output_names = [ - name.replace("present", "past_key_values") - for name in self.multitoken_engine._output_names - if name.startswith("present") - ] - kv_cache = dict(zip(kv_output_names, new_kvs)) - for k, v in kv_cache.items(): - # remove the information about the `new_token` from the cache - v = v[:, :, :-1] - # zero out all the info that does not pertain to the - # "seen" `token` sequence - v[:, :, len(tokens) :] = 0.0 - kv_cache[k] = numpy.ascontiguousarray(v) - - # Obtain the next token from the logits - new_token = TextGenerationPipeline.sample_new_token( - logits=past_logits[0, len(tokens) - 1], - deterministic=self.deterministic, - temperature=self.sampling_temperature, - ) - tokens.append(new_token) - - return tokens, kv_cache - - @staticmethod - def sample_new_token( - logits: numpy.ndarray, deterministic: bool, temperature: float - ) -> int: - """ - Samples a token from the logits using the sampling temperature. 
- - :param logits: the logits from the model - :param deterministic: whether to sample from the softmax or take the argmax - :param temperature: the sampling temperature - - :return: the sampled token - """ - if deterministic: - return numpy.argmax(logits) - else: - logits /= temperature - probs = softmax(logits) - return numpy.random.choice(len(probs), p=probs) - - def _setup_multitoken_onnx_file_path(self) -> str: - # `setup_onnx_file_path` function rewritten - # to setup the multitoken_onnx_file_path - - multitoken_onnx_path = os.path.join( - self.model_path, _MODEL_DIR_ONNX_MULTI_TOKEN_NAME - ) - ( - multitoken_onnx_path, - self.multitoken_onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - multitoken_onnx_path, max_length=self.sequence_length - ) - - return multitoken_onnx_path - - def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]: - # `_initialize_engine` function rewritten - # to initialize the multitoken_engine - - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - if self.context is not None and isinstance(self.context, Context): - self._engine_args.pop("num_cores", None) - self._engine_args.pop("scheduler", None) - self._engine_args["context"] = self.context - return MultiModelEngine( - model=self.onnx_multitoken_path, - **self._engine_args, - ) - return Engine(self.onnx_multitoken_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_multitoken_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) - - def _setup_onnx_file_path(self) -> str: - # `setup_onnx_file_path` function rewritten - - onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) - ( - onnx_path, - self.onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - onnx_path, max_length=self.sequence_length - ) - - return onnx_path - - def _initialize_engine(self): - return None - - def _reinitialize_engine(self) -> Union[Engine, ORTEngine]: - # `_initialize_engine` function rewritten - - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - if self.context is not None and isinstance(self.context, Context): - self._engine_args.pop("num_cores", None) - self._engine_args.pop("scheduler", None) - self._engine_args["context"] = self.context - return MultiModelEngine( - model=self.onnx_path, - **self._engine_args, - ) - return Engine(self.onnx_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. 
Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 88d5414992..3fac78592d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -567,6 +567,34 @@ def _register_pipeline_tasks_decorator(pipeline_class: Pipeline): return _register_pipeline_tasks_decorator + @staticmethod + def create_engine( + onnx_file_path: str, + engine_type: str, + engine_args: Dict, + context: Optional[Context] = None, + ) -> Union[Engine, MultiModelEngine, ORTEngine]: + engine_type = engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if context is not None and isinstance(context, Context): + engine_args.pop("num_cores", None) + engine_args.pop("scheduler", None) + engine_args["context"] = context + return MultiModelEngine( + model=onnx_file_path, + **engine_args, + ) + return Engine(onnx_file_path, **engine_args) + + if engine_type == ORT_ENGINE: + return ORTEngine(onnx_file_path, **engine_args) + + raise ValueError( + f"Unknown engine_type {engine_type}. Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) + @classmethod def from_config( cls, @@ -791,26 +819,10 @@ def engine_forward(self, engine_inputs: List[numpy.ndarray]) -> List[numpy.ndarr """ return self.engine(engine_inputs) - def _initialize_engine(self) -> Union[Engine, ORTEngine]: - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - if self.context is not None and isinstance(self.context, Context): - self._engine_args.pop("num_cores", None) - self._engine_args.pop("scheduler", None) - self._engine_args["context"] = self.context - return MultiModelEngine( - model=self.onnx_file_path, - **self._engine_args, - ) - return Engine(self.onnx_file_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_file_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) + def _initialize_engine(self) -> Union[Engine, MultiModelEngine, ORTEngine]: + return Pipeline.create_engine( + self.onnx_file_path, self.engine_type, self._engine_args, self.context + ) def _identifier(self): # get pipeline identifier; used in the context of logging diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py new file mode 100644 index 0000000000..e6d29616a6 --- /dev/null +++ b/src/deepsparse/pipelines/text_generation.py @@ -0,0 +1,316 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Optional, Tuple, Type, Union + +import numpy +import onnx +from pydantic import BaseModel, Field + +from deepsparse import Pipeline +from deepsparse.transformers.pipelines import TransformersPipeline +from scipy.special import softmax + + +_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" +_MODEL_DIR_ONNX_NAME = "model.onnx" + +__all__ = ["TextGenerationPipeline"] + + +class TextGenerationInput(BaseModel): + sequences: Union[str, List[str]] = Field( + description="The input sequence(s) to generate " + "text from. If a string is provided, " + "the model will generate text from the " + "provided sequence. If a list of strings " + "is provided, the model will " + "generate text from each sequence in the list.", + ) + + +class TextGenerationOutput(BaseModel): + sequences: Union[str, List[str]] = Field( + description="The input text sequence(s) appended with " + "the generated text sequence(s). " + "If a string was provided as input, " + "a string will be returned. " + "If a list of strings was provided as " + "input, a list of strings will be returned.", + ) + + +@Pipeline.register( + task="text_generation", + task_aliases=["codegen"], +) +class TextGenerationPipeline(TransformersPipeline): + """ + Pipeline for text generation tasks. + + :param deterministic: if True, the pipeline will sample from + the probability distribution computed from the logits. + If False, the pipeline will get the next token by applying + an argmax function to the logits. + :param sampling_temperature: the temperature to use when sampling + from the probability distribution computed from the logits. + Higher values will result in more random samples. + :param max_generated_tokens: the maximum number of tokens to generate + given the input sequence. If None, the model will generate + tokens until the end of the sequence is reached. + Otherwise it will generate up to the maximum number of tokens or end of + sequence is reached. + :param kwargs: kwargs to pass to the TransformersPipeline + """ + + def __init__( + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + max_generated_tokens: Optional[int] = 1024, + prompt_batch_threshold: float = 0.25, + **kwargs, + ): + if self._batch_size != 1: + raise ValueError("Only batch size 1 is supported for generation pipelines") + + super().__init__(**kwargs, _delay_engine_initialize=True) + self.deterministic = deterministic + self.sampling_temperature = sampling_temperature + self.max_generated_tokens = max_generated_tokens + self.prompt_batch_threshold = prompt_batch_threshold + + # setup the single token engine -- used to continually generate tokens + self._adapt_onnx_file_sequence_len(sequence_length=1) + self._initialize_engine() + + # setup the multitoken engine -- used for large inputs to generate kv cache + self._adapt_onnx_file_sequence_len(sequence_length=self.sequence_length) + self.multitoken_engine = Pipeline.create_engine( + self.onnx_file_path, self.engine_type, self.engine_args, self.context + ) + + @staticmethod + def route_input_to_bucket( + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + ) -> Pipeline: + """ + This method is used to route the input to the correct pipeline. 
+ + :param args: args to pass to the pipeline + :param input_schema: the input schema for the pipeline + :param pipelines: the list of pipelines to route the input to + :param kwargs: kwargs to pass to the pipeline + :return: the pipeline to route the input to + """ + raise ValueError("Bucketing is not supported for generation pipelines") + + @property + def input_schema(self) -> Type[BaseModel]: + """ + Property to return the input schema for the pipeline. + + :return: the input schema for the pipeline + """ + return TextGenerationInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + Property to return the output schema for the pipeline. + + :return: the output schema for the pipeline + """ + return TextGenerationOutput + + def process_engine_outputs( + self, engine_outputs: List[numpy.ndarray], **kwargs + ) -> TextGenerationOutput: + """ + Convert the engine outputs to the output schema for the pipeline. + + :param engine_outputs: the outputs from the engine + :return: the output schema for the pipeline + """ + sequences = self.tokenizer.batch_decode( + engine_outputs[0], skip_special_tokens=True + ) + return TextGenerationOutput(sequences=sequences) + + def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: + """ + Convert the input schema for the pipeline to the inputs for the engine. + + :param inputs: the input schema for the pipeline + :return: the inputs for the engine + """ + sequences = inputs.sequences + + if isinstance(sequences, List) and all( + isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences + ): + # if batch items contain only one sequence but are wrapped in lists, unwrap + # for use as tokenizer input + sequences = [sequence[0] for sequence in sequences] + + self.tokenizer.pad_token = self.tokenizer.eos_token + + input_tokens = self.tokenizer( + sequences, + return_tensors="np", + max_length=self.sequence_length, + padding="max_length", + ) + + engine_input = self.tokens_to_engine_input( + input_tokens, onnx_input_names=self.onnx_input_names + ) + + return engine_input + + def engine_forward( + self, engine_inputs: List[numpy.ndarray], **kwargs + ) -> numpy.ndarray: + """ + :param engine_inputs: list of numpy inputs to + Pipeline engine forward pass + :return: A numpy array that contains the tokens generated by the model + """ + # run the prompt through + tokens, kv_cache = self.prompt_inference(engine_inputs) + + # create the generated output + max_tokens = ( + self.max_generated_tokens + if self.max_generated_tokens and self.max_generated_tokens > 0 + else 100 * self.sequence_length + ) # set safety for absolute max generation + generated = [] + + while len(generated) < max_tokens: + gen_token, kv_cache = self.autoregressive_inference(tokens, kv_cache) + tokens.append(gen_token) + generated.append(gen_token) + + if gen_token == self.tokenizer.eos_token_id: + break + + return numpy.array([[generated]]) + + def autoregressive_inference( + self, tokens: List[int], kv_cache: Dict[str, numpy.ndarray] + ) -> Tuple[int, Dict[str, numpy.ndarray]]: + """ + + :param tokens: + :param kv_cache: + :return: + """ + new_token = tokens[-1] + + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + attention_mask[:, : len(tokens)] = 1 + attention_mask[:, -1] = 1 + + engine_inputs = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + } + engine_inputs.update(kv_cache) + engine_inputs = [engine_inputs[name] for name in self.engine._input_names] + + new_logits, 
*cache_values = self.engine(engine_inputs) + kv_cache = self._assemble_kv_cache(cache_values, tokens) + + # Obtain the next token from the logits + generated_token = self.generate_token(new_logits[0, -1, :]) + + return generated_token, kv_cache + + def prompt_inference( + self, engine_inputs: List[numpy.ndarray] + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + + :param engine_inputs: + :return: + """ + tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] + new_token = None + + if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: + # prompt size is small, run autoregressive inference to populate kv cache + run_tokens = [] + kv_cache = {} + for token in tokens: + run_tokens.append(token) + new_token, kv_cache = self.autoregressive_inference(run_tokens, kv_cache) + else: + # larger prompt size, run through multitoken engine in single pass + logits, *cache_values = self.multitoken_engine(engine_inputs) + kv_cache = self._assemble_kv_cache(cache_values, tokens) + new_token = self.generate_token(logits[0, len(tokens) - 1]) + + tokens.append(new_token) + + return tokens, kv_cache + + def generate_token(self, logits: numpy.ndarray) -> int: + """ + Samples a token from the logits using the sampling temperature. + + :param logits: the logits from the model + + :return: the sampled token + """ + if self.deterministic: + return numpy.argmax(logits) + + logits /= self.sampling_temperature + probs = softmax(logits) + + return numpy.random.choice(len(probs), p=probs) + + def _assemble_kv_cache( + self, cache_values: List[numpy.ndarray], tokens: List[int] + ) -> Dict[str, numpy.ndarray]: + # rename the output names to match the names expected + # in the next autoregressive pass + cache_keys = [ + name.replace("present", "past_key_values") + for name in self.engine._output_names + if name.startswith("present") + ] + kv_cache = dict(zip(cache_keys, cache_values)) + for key, val in kv_cache.items(): + val[:, :, len(tokens) - 1] = val[:, :, -1] + kv_cache[key] = numpy.ascontiguousarray(val[:, :, :-1]) + + return kv_cache + + def _adapt_onnx_file_sequence_len(self, sequence_length: int): + model = onnx.load(self.onnx_file_path) + initializer_input_names = set([node.name for node in model.graph.initializer]) + external_inputs = [ + inp for inp in model.graph.input if inp.name not in initializer_input_names + ] + input_names = [] + for external_input in external_inputs: + # this is removed for now (will need to be accounted for when we start + # supporting deepsparse engine + external_input.type.tensor_type.shape.dim[0].dim_value = 1 + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + input_names.append(external_input.name) + + onnx.save(model, self.onnx_file_path) From 7f1651df272bf4ac6e9e8299b5212fdc99481af3 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 8 May 2023 15:27:34 +0000 Subject: [PATCH 09/68] restore text generation from examples --- examples/codegen/text_generation.py | 494 ++++++++++++++++++++++++++++ 1 file changed, 494 insertions(+) create mode 100644 examples/codegen/text_generation.py diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py new file mode 100644 index 0000000000..1812c9ef93 --- /dev/null +++ b/examples/codegen/text_generation.py @@ -0,0 +1,494 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from tempfile import NamedTemporaryFile +from typing import Dict, List, Optional, Tuple, Type, Union + +import numpy +import onnx +from pydantic import BaseModel, Field + +from deepsparse import Context, MultiModelEngine, Pipeline +from deepsparse.pipeline import ( + DEEPSPARSE_ENGINE, + ORT_ENGINE, + SUPPORTED_PIPELINE_ENGINES, + Engine, + ORTEngine, +) +from deepsparse.transformers.pipelines import TransformersPipeline +from scipy.special import softmax + + +_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" +_MODEL_DIR_ONNX_NAME = "model.onnx" + +__all__ = ["TextGenerationPipeline"] + + +def overwrite_transformer_onnx_model_inputs( + path: str, + batch_size: int = 1, + max_length: int = 128, + output_path: Optional[str] = None, +) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: + """ + Overrides an ONNX model's inputs to have the given batch size and sequence lengths. + Assumes that these are the first and second shape indices of the given model inputs + respectively + + :param path: path to the ONNX model to override + :param batch_size: batch size to set + :param max_length: max sequence length to set + :param output_path: if provided, the model will be saved to the given path, + otherwise, the model will be saved to a named temporary file that will + be deleted after the program exits + :return: if no output path, a tuple of the saved path to the model, list of + model input names, and reference to the tempfile object will be returned + otherwise, only the model input names will be returned + """ + # overwrite input shapes + model = onnx.load(path) + initializer_input_names = set([node.name for node in model.graph.initializer]) + external_inputs = [ + inp for inp in model.graph.input if inp.name not in initializer_input_names + ] + input_names = [] + for external_input in external_inputs: + # this is removed for now (will need to be accounted for when we start + # supporting deepsparse engine + # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + # external_input.type.tensor_type.shape.dim[1].dim_value = max_length + input_names.append(external_input.name) + + # Save modified model + if output_path is None: + tmp_file = NamedTemporaryFile() # file will be deleted after program exit + onnx.save(model, tmp_file.name) + + return tmp_file.name, input_names, tmp_file + else: + onnx.save(model, output_path) + return input_names + + +class TextGenerationInput(BaseModel): + sequences: Union[str, List[str]] = Field( + description="The input sequence(s) to generate " + "text from. If a string is provided, " + "the model will generate text from the " + "provided sequence. If a list of strings " + "is provided, the model will " + "generate text from each sequence in the list.", + ) + + +class TextGenerationOutput(BaseModel): + sequences: Union[str, List[str]] = Field( + description="The input text sequence(s) appended with " + "the generated text sequence(s). " + "If a string was provided as input, " + "a string will be returned. 
" + "If a list of strings was provided as " + "input, a list of strings will be returned.", + ) + + +@Pipeline.register( + task="text_generation", + task_aliases=["codegen"], +) +class TextGenerationPipeline(TransformersPipeline): + """ + Pipeline for text generation tasks. + + :param deterministic: if True, the pipeline will sample from + the probability distribution computed from the logits. + If False, the pipeline will get the next token by applying + an argmax function to the logits. + :param sampling_temperature: the temperature to use when sampling + from the probability distribution computed from the logits. + Higher values will result in more random samples. + :param num_tokens_to_generate: the number of tokens to generate + given the input sequence. If None, the model will generate + tokens until the end of the sequence is reached. + :param kwargs: kwargs to pass to the TransformersPipeline + """ + + def __init__( + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + num_tokens_to_generate: Optional[int] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.deterministic = deterministic + self.sampling_temperature = sampling_temperature + self.num_tokens_to_generate = num_tokens_to_generate + + # set-up the auxiliary multitoken model + self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path() + # initialize the auxiliary multitoken engine + self.multitoken_engine = self._initialize_multitoken_engine() + + # re-initialize the target model + # this will be removed once codegen is productionized + self.onnx_path = self._setup_onnx_file_path() + self.engine = self._reinitialize_engine() + + if self._batch_size != 1: + raise ValueError( + "For the sake of simplicity, only dynamic" + "batch shape is supported for now. " + "Set `batch_size` to 1 or None." + ) + + @staticmethod + def route_input_to_bucket( + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + ) -> Pipeline: + """ + This method is used to route the input to the correct pipeline. + + :param args: args to pass to the pipeline + :param input_schema: the input schema for the pipeline + :param pipelines: the list of pipelines to route the input to + :param kwargs: kwargs to pass to the pipeline + :return: the pipeline to route the input to + """ + raise NotImplementedError + + @property + def input_schema(self) -> Type[BaseModel]: + """ + Property to return the input schema for the pipeline. + + :return: the input schema for the pipeline + """ + return TextGenerationInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + Property to return the output schema for the pipeline. + + :return: the output schema for the pipeline + """ + return TextGenerationOutput + + def process_engine_outputs( + self, engine_outputs: List[numpy.ndarray], **kwargs + ) -> BaseModel: + """ + Convert the engine outputs to the output schema for the pipeline. + + :param engine_outputs: the outputs from the engine + :return: the output schema for the pipeline + """ + sequences = self.tokenizer.batch_decode( + engine_outputs[0], skip_special_tokens=True + ) + return TextGenerationOutput(sequences=sequences) + + def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]: + """ + Convert the input schema for the pipeline to the inputs for the engine. 
+ + :param inputs: the input schema for the pipeline + :return: the inputs for the engine + """ + sequences = inputs.sequences + + if isinstance(sequences, List) and all( + isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences + ): + # if batch items contain only one sequence but are wrapped in lists, unwrap + # for use as tokenizer input + sequences = [sequence[0] for sequence in sequences] + + self.tokenizer.pad_token = self.tokenizer.eos_token + + input_tokens = self.tokenizer( + sequences, + return_tensors="np", + max_length=self.sequence_length, + padding="max_length", + ) + + engine_input = self.tokens_to_engine_input( + input_tokens, onnx_input_names=self.multitoken_engine._input_names + ) + + return engine_input + + def engine_forward( + self, engine_inputs: List[numpy.ndarray], **kwargs + ) -> numpy.ndarray: + """ + :param engine_inputs: list of numpy inputs to + Pipeline engine forward pass + :return: A numpy array that contains the tokens generated by the model + """ + + # list of the meaningful tokens in the sequence + tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] + + tokens, kv_cache = self.initial_autoregressive_pass( + tokens=tokens, engine_inputs=engine_inputs + ) + + # establish the number of autoregressive passes to perform + num_iterations = self.sequence_length - len(tokens) + if self.num_tokens_to_generate: + if self.num_tokens_to_generate > num_iterations: + raise ValueError( + f"Num_tokens_to_generate ({self.num_tokens_to_generate}) " + f"cannot be greater than sequence_length ({self.sequence_length}) " + f"minus the number of tokens in the input sequence ({len(tokens)})." + ) + num_iterations = self.num_tokens_to_generate + + # perform the remaining autoregressive passes + for iter in range(num_iterations): + eos_token_found = self.tokenizer.eos_token_id == tokens[-1] + if eos_token_found: + # fill the token list so that it has the correct sequence length + tokens = tokens + [self.tokenizer.pad_token_id] * ( + self.sequence_length - len(tokens) + ) + return numpy.array([[tokens]]) + + tokens, kv_cache = self.autoregressive_pass( + tokens=tokens, + kv_cache=kv_cache, + ) + + # fill the token list so that it has the correct sequence length + tokens = tokens + [self.tokenizer.pad_token_id] * ( + self.sequence_length - len(tokens) + ) + return numpy.array([[tokens]]) + + def autoregressive_pass( + self, + tokens: List[int], + kv_cache: Dict[str, numpy.ndarray], + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + Performs an autoregressive pass to generate the next token in the sequence + and update the kv_cache with the new key/value pairs. + + 1) Set the attention mask to 1 for the tokens that are already in the sequence + and 1 for the `new_token` - at the last position. This is because the + `new_token`'s key/value will be added to the set of keys/values + at the last position (before being fed to an attention block) + 2) Set up the engine inputs + 3) Run the engine forward pas + 4) Preprocesses the kv cache so that it can be used as input to the next + autoregressive pass. + 5) Returns the new token sequence and the updated kv cache. 
+ + :param tokens: the current token sequence + :param kv_cache: the current kv_cache + :return: the new token sequence and the updated kv cache + """ + + new_token = tokens[-1] + + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + attention_mask[:, : len(tokens)] = 1 + attention_mask[:, -1] = 1 + + engine_inputs_dict = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + } + engine_inputs_dict.update(kv_cache) + + engine_inputs = [engine_inputs_dict[name] for name in self.engine._input_names] + + new_logits, *new_kvs = self.engine(engine_inputs) + + # rename the output names to match the names expected + # in the next autoregressive pass + kv_output_names = [ + name.replace("present", "past_key_values") + for name in self.engine._output_names + if name.startswith("present") + ] + kv_cache = dict(zip(kv_output_names, new_kvs)) + for k, v in kv_cache.items(): + v[:, :, len(tokens) - 1] = v[:, :, -1] + kv_cache[k] = numpy.ascontiguousarray(v[:, :, :-1]) + + # Obtain the next token from the logits + new_token = TextGenerationPipeline.sample_new_token( + logits=new_logits[0, -1, :], + deterministic=self.deterministic, + temperature=self.sampling_temperature, + ) + tokens.append(new_token) + + return tokens, kv_cache + + def initial_autoregressive_pass( + self, + tokens: List[int], + engine_inputs: List[numpy.ndarray], + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + Performs a single autoregressive pass to initialize the key, value cache. + + 1) Obtains logits and kv cache for the input sequence. + From logits, obtains the next token. + 2) Preprocesses the kv cache so that it can be used as input to the next + autoregressive pass. + 3) Returns the new token sequence and the updated kv cache. + + :param tokens: input tokens provided by the user + :param engine_inputs: list of numpy inputs to Pipeline + engine forward pass + :return: the extended token sequence and the kv cache + """ + + past_logits, *new_kvs = self.multitoken_engine(engine_inputs) + + # rename the output names to match the names expected + # in the next autoregressive pass + kv_output_names = [ + name.replace("present", "past_key_values") + for name in self.multitoken_engine._output_names + if name.startswith("present") + ] + kv_cache = dict(zip(kv_output_names, new_kvs)) + for k, v in kv_cache.items(): + # remove the information about the `new_token` from the cache + v = v[:, :, :-1] + # zero out all the info that does not pertain to the + # "seen" `token` sequence + v[:, :, len(tokens) :] = 0.0 + kv_cache[k] = numpy.ascontiguousarray(v) + + # Obtain the next token from the logits + new_token = TextGenerationPipeline.sample_new_token( + logits=past_logits[0, len(tokens) - 1], + deterministic=self.deterministic, + temperature=self.sampling_temperature, + ) + tokens.append(new_token) + + return tokens, kv_cache + + @staticmethod + def sample_new_token( + logits: numpy.ndarray, deterministic: bool, temperature: float + ) -> int: + """ + Samples a token from the logits using the sampling temperature. 
+ + :param logits: the logits from the model + :param deterministic: whether to sample from the softmax or take the argmax + :param temperature: the sampling temperature + + :return: the sampled token + """ + if deterministic: + return numpy.argmax(logits) + else: + logits /= temperature + probs = softmax(logits) + return numpy.random.choice(len(probs), p=probs) + + def _setup_multitoken_onnx_file_path(self) -> str: + # `setup_onnx_file_path` function rewritten + # to setup the multitoken_onnx_file_path + + multitoken_onnx_path = os.path.join( + self.model_path, _MODEL_DIR_ONNX_MULTI_TOKEN_NAME + ) + ( + multitoken_onnx_path, + self.multitoken_onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + multitoken_onnx_path, max_length=self.sequence_length + ) + + return multitoken_onnx_path + + def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]: + # `_initialize_engine` function rewritten + # to initialize the multitoken_engine + + engine_type = self.engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.context is not None and isinstance(self.context, Context): + self._engine_args.pop("num_cores", None) + self._engine_args.pop("scheduler", None) + self._engine_args["context"] = self.context + return MultiModelEngine( + model=self.onnx_multitoken_path, + **self._engine_args, + ) + return Engine(self.onnx_multitoken_path, **self._engine_args) + elif engine_type == ORT_ENGINE: + return ORTEngine(self.onnx_multitoken_path, **self._engine_args) + else: + raise ValueError( + f"Unknown engine_type {self.engine_type}. Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) + + def _setup_onnx_file_path(self) -> str: + # `setup_onnx_file_path` function rewritten + + onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, max_length=self.sequence_length + ) + + return onnx_path + + def _initialize_engine(self): + return None + + def _reinitialize_engine(self) -> Union[Engine, ORTEngine]: + # `_initialize_engine` function rewritten + + engine_type = self.engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.context is not None and isinstance(self.context, Context): + self._engine_args.pop("num_cores", None) + self._engine_args.pop("scheduler", None) + self._engine_args["context"] = self.context + return MultiModelEngine( + model=self.onnx_path, + **self._engine_args, + ) + return Engine(self.onnx_path, **self._engine_args) + elif engine_type == ORT_ENGINE: + return ORTEngine(self.onnx_path, **self._engine_args) + else: + raise ValueError( + f"Unknown engine_type {self.engine_type}. Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) From b85746d509ac64842831e7be703000ce240d375d Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 8 May 2023 17:28:54 +0200 Subject: [PATCH 10/68] [CodeGen] ONNX model loading to support >2Gb models / two engines (#991) --- examples/codegen/text_generation.py | 160 ++++++++++++------------- src/deepsparse/transformers/helpers.py | 28 +++-- 2 files changed, 94 insertions(+), 94 deletions(-) diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py index 1812c9ef93..d3bb477e01 100644 --- a/examples/codegen/text_generation.py +++ b/examples/codegen/text_generation.py @@ -13,12 +13,12 @@ # limitations under the License. 
import os -from tempfile import NamedTemporaryFile from typing import Dict, List, Optional, Tuple, Type, Union import numpy -import onnx +from onnx import ValueInfoProto from pydantic import BaseModel, Field +from transformers import AutoConfig, AutoTokenizer from deepsparse import Context, MultiModelEngine, Pipeline from deepsparse.pipeline import ( @@ -28,6 +28,7 @@ Engine, ORTEngine, ) +from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs from deepsparse.transformers.pipelines import TransformersPipeline from scipy.special import softmax @@ -35,53 +36,54 @@ _MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" _MODEL_DIR_ONNX_NAME = "model.onnx" -__all__ = ["TextGenerationPipeline"] + +def overwrite_multi_token_onnx_model_inputs( + external_inputs: List[ValueInfoProto], batch_size: int, max_length: int +) -> List[str]: + """ + Overwrite the input shape of the onnx model for multi token generation. + + :param external_inputs: the external inputs of the onnx model + :param batch_size: the batch size of the input + :param max_length: the max length of the input + :return: the input names of the onnx model + """ + input_names = [] + for external_input in external_inputs: + for single_input in external_input.type.tensor_type.shape.dim: + if single_input.dim_param == "batch_size": + single_input.dim_value = batch_size + elif single_input.dim_param == "sequence_length": + single_input.dim_value = max_length + input_names.append(external_input.name) + return input_names -def overwrite_transformer_onnx_model_inputs( - path: str, - batch_size: int = 1, - max_length: int = 128, - output_path: Optional[str] = None, -) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: +def overwrite_single_token_onnx_model_inputs( + external_inputs: List[ValueInfoProto], batch_size: int, max_length: int +) -> List[str]: """ - Overrides an ONNX model's inputs to have the given batch size and sequence lengths. - Assumes that these are the first and second shape indices of the given model inputs - respectively - - :param path: path to the ONNX model to override - :param batch_size: batch size to set - :param max_length: max sequence length to set - :param output_path: if provided, the model will be saved to the given path, - otherwise, the model will be saved to a named temporary file that will - be deleted after the program exits - :return: if no output path, a tuple of the saved path to the model, list of - model input names, and reference to the tempfile object will be returned - otherwise, only the model input names will be returned + Overwrite the input shapes of the onnx model of the single token model. 
+ + :param external_inputs: the external inputs of the onnx model + :param batch_size: the batch size to overwrite the input shapes with + :param max_length: the max length to overwrite the input shapes with + :return: the input names of the onnx model """ - # overwrite input shapes - model = onnx.load(path) - initializer_input_names = set([node.name for node in model.graph.initializer]) - external_inputs = [ - inp for inp in model.graph.input if inp.name not in initializer_input_names - ] input_names = [] for external_input in external_inputs: - # this is removed for now (will need to be accounted for when we start - # supporting deepsparse engine - # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # external_input.type.tensor_type.shape.dim[1].dim_value = max_length + for single_input in external_input.type.tensor_type.shape.dim: + if single_input.dim_param == "batch_size": + single_input.dim_value = batch_size + elif single_input.dim_param == "past_sequence_length + sequence_length": + single_input.dim_value = max_length + elif single_input.dim_param == "past_sequence_length + 1": + single_input.dim_value = max_length input_names.append(external_input.name) + return input_names - # Save modified model - if output_path is None: - tmp_file = NamedTemporaryFile() # file will be deleted after program exit - onnx.save(model, tmp_file.name) - return tmp_file.name, input_names, tmp_file - else: - onnx.save(model, output_path) - return input_names +__all__ = ["TextGenerationPipeline"] class TextGenerationInput(BaseModel): @@ -144,11 +146,6 @@ def __init__( # initialize the auxiliary multitoken engine self.multitoken_engine = self._initialize_multitoken_engine() - # re-initialize the target model - # this will be removed once codegen is productionized - self.onnx_path = self._setup_onnx_file_path() - self.engine = self._reinitialize_engine() - if self._batch_size != 1: raise ValueError( "For the sake of simplicity, only dynamic" @@ -412,6 +409,34 @@ def sample_new_token( probs = softmax(logits) return numpy.random.choice(len(probs), p=probs) + def setup_onnx_file_path(self) -> str: + onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) + + config_path = self.model_path + tokenizer_path = self.model_path + + self.config = AutoConfig.from_pretrained( + config_path, finetuning_task=self.task if hasattr(self, "task") else None + ) + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, + model_max_length=self.sequence_length, + ) + self.config_path = os.path.join(config_path, "config.json") + self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json") + + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, + max_length=self.sequence_length, + custom_input_overwrite_func=overwrite_single_token_onnx_model_inputs, + ) + + return onnx_path + def _setup_multitoken_onnx_file_path(self) -> str: # `setup_onnx_file_path` function rewritten # to setup the multitoken_onnx_file_path @@ -424,7 +449,10 @@ def _setup_multitoken_onnx_file_path(self) -> str: self.multitoken_onnx_input_names, self._temp_model_directory, ) = overwrite_transformer_onnx_model_inputs( - multitoken_onnx_path, max_length=self.sequence_length + multitoken_onnx_path, + max_length=self.sequence_length, + load_external_data=False, + custom_input_overwrite_func=overwrite_multi_token_onnx_model_inputs, ) return multitoken_onnx_path @@ -452,43 +480,3 @@ def _initialize_multitoken_engine(self) -> Union[Engine, 
ORTEngine]: f"Unknown engine_type {self.engine_type}. Supported values include: " f"{SUPPORTED_PIPELINE_ENGINES}" ) - - def _setup_onnx_file_path(self) -> str: - # `setup_onnx_file_path` function rewritten - - onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) - ( - onnx_path, - self.onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - onnx_path, max_length=self.sequence_length - ) - - return onnx_path - - def _initialize_engine(self): - return None - - def _reinitialize_engine(self) -> Union[Engine, ORTEngine]: - # `_initialize_engine` function rewritten - - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - if self.context is not None and isinstance(self.context, Context): - self._engine_args.pop("num_cores", None) - self._engine_args.pop("scheduler", None) - self._engine_args["context"] = self.context - return MultiModelEngine( - model=self.onnx_path, - **self._engine_args, - ) - return Engine(self.onnx_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d80949eb11..fde8c9132b 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -21,7 +21,7 @@ import re from pathlib import Path from tempfile import NamedTemporaryFile -from typing import List, Optional, Tuple, Union +from typing import Callable, List, Optional, Tuple, Union import numpy import onnx @@ -136,6 +136,8 @@ def overwrite_transformer_onnx_model_inputs( batch_size: int = 1, max_length: int = 128, output_path: Optional[str] = None, + load_external_data: bool = True, + custom_input_overwrite_func: Optional[Callable] = None, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ Overrides an ONNX model's inputs to have the given batch size and sequence lengths. @@ -148,21 +150,32 @@ def overwrite_transformer_onnx_model_inputs( :param output_path: if provided, the model will be saved to the given path, otherwise, the model will be saved to a named temporary file that will be deleted after the program exits + :param load_external_data: if True, external data will be loaded into the model + graph. If False, external data will not be loaded and the model will be + saved without external data + :custom_input_overwrite_func: if provided, this function will be called instead + of the default input overwrite function. 
This function should take in a list + of external inputs and return a list of the overwritten input names :return: if no output path, a tuple of the saved path to the model, list of model input names, and reference to the tempfile object will be returned otherwise, only the model input names will be returned """ # overwrite input shapes - model = onnx.load(path) + model = onnx.load_model(path, load_external_data=load_external_data) initializer_input_names = set([node.name for node in model.graph.initializer]) external_inputs = [ inp for inp in model.graph.input if inp.name not in initializer_input_names ] - input_names = [] - for external_input in external_inputs: - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = max_length - input_names.append(external_input.name) + if custom_input_overwrite_func is not None: + input_names = custom_input_overwrite_func( + external_inputs, batch_size, max_length + ) + else: + input_names = [] + for external_input in external_inputs: + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = max_length + input_names.append(external_input.name) # Save modified model if output_path is None: @@ -171,7 +184,6 @@ def overwrite_transformer_onnx_model_inputs( return tmp_file.name, input_names, tmp_file else: save_onnx(model, output_path) - return input_names From aadc60846456a1b237a842e873aca1ed6e15862d Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 10 May 2023 17:22:03 +0000 Subject: [PATCH 11/68] refactor sucessfull --- examples/codegen/text_generation.py | 166 +++++----- src/deepsparse/pipeline.py | 2 +- src/deepsparse/pipelines/text_generation.py | 290 ++++++++++++------ .../pipelines/text_generation_config.py | 36 +++ src/deepsparse/tasks.py | 21 ++ src/deepsparse/text_generation/__init__.py | 13 + .../text_generation/pipelines/__init__.py | 13 + .../text_generation/pipelines/codegen.py | 21 ++ src/deepsparse/text_generation/utils.py | 24 ++ src/deepsparse/transformers/helpers.py | 10 +- 10 files changed, 415 insertions(+), 181 deletions(-) create mode 100644 src/deepsparse/pipelines/text_generation_config.py create mode 100644 src/deepsparse/text_generation/__init__.py create mode 100644 src/deepsparse/text_generation/pipelines/__init__.py create mode 100644 src/deepsparse/text_generation/pipelines/codegen.py create mode 100644 src/deepsparse/text_generation/utils.py diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py index d3bb477e01..0797a8c658 100644 --- a/examples/codegen/text_generation.py +++ b/examples/codegen/text_generation.py @@ -13,12 +13,12 @@ # limitations under the License. 
import os +from tempfile import NamedTemporaryFile from typing import Dict, List, Optional, Tuple, Type, Union import numpy -from onnx import ValueInfoProto +import onnx from pydantic import BaseModel, Field -from transformers import AutoConfig, AutoTokenizer from deepsparse import Context, MultiModelEngine, Pipeline from deepsparse.pipeline import ( @@ -28,62 +28,60 @@ Engine, ORTEngine, ) -from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs +from deepsparse.text_generation.utils import softmax from deepsparse.transformers.pipelines import TransformersPipeline -from scipy.special import softmax _MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" _MODEL_DIR_ONNX_NAME = "model.onnx" - -def overwrite_multi_token_onnx_model_inputs( - external_inputs: List[ValueInfoProto], batch_size: int, max_length: int -) -> List[str]: - """ - Overwrite the input shape of the onnx model for multi token generation. - - :param external_inputs: the external inputs of the onnx model - :param batch_size: the batch size of the input - :param max_length: the max length of the input - :return: the input names of the onnx model - """ - input_names = [] - for external_input in external_inputs: - for single_input in external_input.type.tensor_type.shape.dim: - if single_input.dim_param == "batch_size": - single_input.dim_value = batch_size - elif single_input.dim_param == "sequence_length": - single_input.dim_value = max_length - input_names.append(external_input.name) - return input_names +__all__ = ["TextGenerationPipeline"] -def overwrite_single_token_onnx_model_inputs( - external_inputs: List[ValueInfoProto], batch_size: int, max_length: int -) -> List[str]: +def overwrite_transformer_onnx_model_inputs( + path: str, + batch_size: int = 1, + max_length: int = 128, + output_path: Optional[str] = None, +) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ - Overwrite the input shapes of the onnx model of the single token model. - - :param external_inputs: the external inputs of the onnx model - :param batch_size: the batch size to overwrite the input shapes with - :param max_length: the max length to overwrite the input shapes with - :return: the input names of the onnx model + Overrides an ONNX model's inputs to have the given batch size and sequence lengths. 
+ Assumes that these are the first and second shape indices of the given model inputs + respectively + + :param path: path to the ONNX model to override + :param batch_size: batch size to set + :param max_length: max sequence length to set + :param output_path: if provided, the model will be saved to the given path, + otherwise, the model will be saved to a named temporary file that will + be deleted after the program exits + :return: if no output path, a tuple of the saved path to the model, list of + model input names, and reference to the tempfile object will be returned + otherwise, only the model input names will be returned """ + # overwrite input shapes + model = onnx.load(path) + initializer_input_names = set([node.name for node in model.graph.initializer]) + external_inputs = [ + inp for inp in model.graph.input if inp.name not in initializer_input_names + ] input_names = [] for external_input in external_inputs: - for single_input in external_input.type.tensor_type.shape.dim: - if single_input.dim_param == "batch_size": - single_input.dim_value = batch_size - elif single_input.dim_param == "past_sequence_length + sequence_length": - single_input.dim_value = max_length - elif single_input.dim_param == "past_sequence_length + 1": - single_input.dim_value = max_length + # this is removed for now (will need to be accounted for when we start + # supporting deepsparse engine + # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + # external_input.type.tensor_type.shape.dim[1].dim_value = max_length input_names.append(external_input.name) - return input_names + # Save modified model + if output_path is None: + tmp_file = NamedTemporaryFile() # file will be deleted after program exit + onnx.save(model, tmp_file.name) -__all__ = ["TextGenerationPipeline"] + return tmp_file.name, input_names, tmp_file + else: + onnx.save(model, output_path) + return input_names class TextGenerationInput(BaseModel): @@ -108,10 +106,6 @@ class TextGenerationOutput(BaseModel): ) -@Pipeline.register( - task="text_generation", - task_aliases=["codegen"], -) class TextGenerationPipeline(TransformersPipeline): """ Pipeline for text generation tasks. 
@@ -146,6 +140,11 @@ def __init__( # initialize the auxiliary multitoken engine self.multitoken_engine = self._initialize_multitoken_engine() + # re-initialize the target model + # this will be removed once codegen is productionized + self.onnx_path = self._setup_onnx_file_path() + self.engine = self._reinitialize_engine() + if self._batch_size != 1: raise ValueError( "For the sake of simplicity, only dynamic" @@ -409,34 +408,6 @@ def sample_new_token( probs = softmax(logits) return numpy.random.choice(len(probs), p=probs) - def setup_onnx_file_path(self) -> str: - onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) - - config_path = self.model_path - tokenizer_path = self.model_path - - self.config = AutoConfig.from_pretrained( - config_path, finetuning_task=self.task if hasattr(self, "task") else None - ) - self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_path, - model_max_length=self.sequence_length, - ) - self.config_path = os.path.join(config_path, "config.json") - self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json") - - ( - onnx_path, - self.onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - onnx_path, - max_length=self.sequence_length, - custom_input_overwrite_func=overwrite_single_token_onnx_model_inputs, - ) - - return onnx_path - def _setup_multitoken_onnx_file_path(self) -> str: # `setup_onnx_file_path` function rewritten # to setup the multitoken_onnx_file_path @@ -449,10 +420,7 @@ def _setup_multitoken_onnx_file_path(self) -> str: self.multitoken_onnx_input_names, self._temp_model_directory, ) = overwrite_transformer_onnx_model_inputs( - multitoken_onnx_path, - max_length=self.sequence_length, - load_external_data=False, - custom_input_overwrite_func=overwrite_multi_token_onnx_model_inputs, + multitoken_onnx_path, max_length=self.sequence_length ) return multitoken_onnx_path @@ -480,3 +448,43 @@ def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]: f"Unknown engine_type {self.engine_type}. Supported values include: " f"{SUPPORTED_PIPELINE_ENGINES}" ) + + def _setup_onnx_file_path(self) -> str: + # `setup_onnx_file_path` function rewritten + + onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, max_length=self.sequence_length + ) + + return onnx_path + + def _initialize_engine(self): + return None + + def _reinitialize_engine(self) -> Union[Engine, ORTEngine]: + # `_initialize_engine` function rewritten + + engine_type = self.engine_type.lower() + + if engine_type == DEEPSPARSE_ENGINE: + if self.context is not None and isinstance(self.context, Context): + self._engine_args.pop("num_cores", None) + self._engine_args.pop("scheduler", None) + self._engine_args["context"] = self.context + return MultiModelEngine( + model=self.onnx_path, + **self._engine_args, + ) + return Engine(self.onnx_path, **self._engine_args) + elif engine_type == ORT_ENGINE: + return ORTEngine(self.onnx_path, **self._engine_args) + else: + raise ValueError( + f"Unknown engine_type {self.engine_type}. 
Supported values include: " + f"{SUPPORTED_PIPELINE_ENGINES}" + ) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 3fac78592d..4ffda8800d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = list(self.executor.map(self.engine_forward, batches)) + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index e6d29616a6..686f4af12a 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -12,42 +12,47 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Optional, Tuple, Type, Union +import os +from typing import Dict, List, Optional, Tuple, Type import numpy import onnx +from onnx import ValueInfoProto from pydantic import BaseModel, Field +from transformers import AutoConfig, AutoTokenizer from deepsparse import Pipeline +from deepsparse.transformers.helpers import ( + get_onnx_path_and_configs, + overwrite_transformer_onnx_model_inputs, +) from deepsparse.transformers.pipelines import TransformersPipeline -from scipy.special import softmax +# TODO: to be deprecated after Sage's changes, we will only need a single model _MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" -_MODEL_DIR_ONNX_NAME = "model.onnx" __all__ = ["TextGenerationPipeline"] +def softmax(x: numpy.ndarray) -> numpy.ndarray: + """ + Compute softmax values for x + :param x: input array + :return: softmax values + """ + return numpy.exp(x) / numpy.sum(numpy.exp(x), axis=0) + + class TextGenerationInput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The input sequence(s) to generate " - "text from. If a string is provided, " - "the model will generate text from the " - "provided sequence. If a list of strings " - "is provided, the model will " - "generate text from each sequence in the list.", + sequence: str = Field( + description="The input sequence to generate the text from.", ) class TextGenerationOutput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The input text sequence(s) appended with " - "the generated text sequence(s). " - "If a string was provided as input, " - "a string will be returned. 
" - "If a list of strings was provided as " - "input, a list of strings will be returned.", + sequence: str = Field( + description="The generated text sequence.", ) @@ -82,23 +87,23 @@ def __init__( prompt_batch_threshold: float = 0.25, **kwargs, ): + super().__init__(**kwargs) + if self._batch_size != 1: raise ValueError("Only batch size 1 is supported for generation pipelines") - super().__init__(**kwargs, _delay_engine_initialize=True) self.deterministic = deterministic self.sampling_temperature = sampling_temperature self.max_generated_tokens = max_generated_tokens self.prompt_batch_threshold = prompt_batch_threshold - # setup the single token engine -- used to continually generate tokens - self._adapt_onnx_file_sequence_len(sequence_length=1) - self._initialize_engine() - - # setup the multitoken engine -- used for large inputs to generate kv cache - self._adapt_onnx_file_sequence_len(sequence_length=self.sequence_length) + # additional setup the multitoken engine, + # used for large inputs to generate kv cache + # TODO: to be deprecated after Sage's changes + self.onnx_multitoken_path = self.setup_onnx_file_path(multitoken=True) + # initialize the auxiliary multitoken engine self.multitoken_engine = Pipeline.create_engine( - self.onnx_file_path, self.engine_type, self.engine_args, self.context + self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context ) @staticmethod @@ -134,20 +139,6 @@ def output_schema(self) -> Type[BaseModel]: """ return TextGenerationOutput - def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs - ) -> TextGenerationOutput: - """ - Convert the engine outputs to the output schema for the pipeline. - - :param engine_outputs: the outputs from the engine - :return: the output schema for the pipeline - """ - sequences = self.tokenizer.batch_decode( - engine_outputs[0], skip_special_tokens=True - ) - return TextGenerationOutput(sequences=sequences) - def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: """ Convert the input schema for the pipeline to the inputs for the engine. @@ -155,34 +146,38 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: :param inputs: the input schema for the pipeline :return: the inputs for the engine """ - sequences = inputs.sequences - - if isinstance(sequences, List) and all( - isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences - ): - # if batch items contain only one sequence but are wrapped in lists, unwrap - # for use as tokenizer input - sequences = [sequence[0] for sequence in sequences] self.tokenizer.pad_token = self.tokenizer.eos_token input_tokens = self.tokenizer( - sequences, + inputs.sequence, return_tensors="np", max_length=self.sequence_length, padding="max_length", ) - engine_input = self.tokens_to_engine_input( - input_tokens, onnx_input_names=self.onnx_input_names - ) + engine_input = self.tokens_to_engine_input(input_tokens) return engine_input + def process_engine_outputs( + self, engine_outputs: List[numpy.ndarray], **kwargs + ) -> TextGenerationOutput: + """ + Convert the engine outputs to the output schema for the pipeline. + + :param engine_outputs: the outputs from the engine + :return: the output schema for the pipeline + """ + sequence = self.tokenizer.decode(engine_outputs[0][0], skip_special_tokens=True) + return TextGenerationOutput(sequence=sequence) + def engine_forward( self, engine_inputs: List[numpy.ndarray], **kwargs ) -> numpy.ndarray: """ + Run the forward pass on the engine. 
+ :param engine_inputs: list of numpy inputs to Pipeline engine forward pass :return: A numpy array that contains the tokens generated by the model @@ -191,12 +186,13 @@ def engine_forward( tokens, kv_cache = self.prompt_inference(engine_inputs) # create the generated output + # TODO: Get clarity here, are we running the sliding window there? max_tokens = ( self.max_generated_tokens if self.max_generated_tokens and self.max_generated_tokens > 0 else 100 * self.sequence_length ) # set safety for absolute max generation - generated = [] + generated = [tokens[-1]] while len(generated) < max_tokens: gen_token, kv_cache = self.autoregressive_inference(tokens, kv_cache) @@ -208,17 +204,61 @@ def engine_forward( return numpy.array([[generated]]) + def prompt_inference( + self, engine_inputs: List[numpy.ndarray] + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + An inference run that processes the prompt through the + model to generate the new token and populate the kv cache. + + :param engine_inputs: the prompt (context) represented by a + list of numpy inputs to the engine + :return: + - the list of prompt tokens plus the new, generated token + - the kv cache that was populated during the inference + """ + tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] + new_token = None + + if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: + # prompt size is small, run autoregressive inference to populate kv cache + run_tokens = [] + kv_cache = {} + for token in tokens: + run_tokens.append(token) + new_token, kv_cache = self.autoregressive_inference( + run_tokens, kv_cache + ) + else: + # larger prompt size, run through multi-token engine in single pass + logits, *cache_values = self.multitoken_engine(engine_inputs) + kv_cache = self._assemble_kv_cache( + cache_values, tokens, prompt_inference=True + ) + new_token = self.generate_token(logits[0, len(tokens) - 1]) + + tokens.append(new_token) + + return tokens, kv_cache + def autoregressive_inference( self, tokens: List[int], kv_cache: Dict[str, numpy.ndarray] ) -> Tuple[int, Dict[str, numpy.ndarray]]: """ + An inference run that processes the last token and the kv cache to + generate a new token and update the kv cache. - :param tokens: - :param kv_cache: + :param tokens: The current context (prompt + generated tokens so far) + :param kv_cache: The key-value cache from the previous inference run :return: + - the list of prompt tokens plus the new, generated token + - the kv cache that was populated during the inference """ new_token = tokens[-1] + # Create the boolean attention mask: + # e.g. 
[1, 1, 1, 1, 1, 0, 0, ..., 1] where first 1's correspond + # to the kv_cache and the last one corresponds to the new token attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) attention_mask[:, : len(tokens)] = 1 attention_mask[:, -1] = 1 @@ -227,6 +267,8 @@ def autoregressive_inference( "input_ids": numpy.array([[new_token]]), "attention_mask": attention_mask, } + + kv_cache = kv_cache if kv_cache else self._initialize_kv_cache() engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine._input_names] @@ -238,34 +280,6 @@ def autoregressive_inference( return generated_token, kv_cache - def prompt_inference( - self, engine_inputs: List[numpy.ndarray] - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: - """ - - :param engine_inputs: - :return: - """ - tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] - new_token = None - - if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: - # prompt size is small, run autoregressive inference to populate kv cache - run_tokens = [] - kv_cache = {} - for token in tokens: - run_tokens.append(token) - new_token, kv_cache = self.autoregressive_inference(run_tokens, kv_cache) - else: - # larger prompt size, run through multitoken engine in single pass - logits, *cache_values = self.multitoken_engine(engine_inputs) - kv_cache = self._assemble_kv_cache(cache_values, tokens) - new_token = self.generate_token(logits[0, len(tokens) - 1]) - - tokens.append(new_token) - - return tokens, kv_cache - def generate_token(self, logits: numpy.ndarray) -> int: """ Samples a token from the logits using the sampling temperature. @@ -278,12 +292,81 @@ def generate_token(self, logits: numpy.ndarray) -> int: return numpy.argmax(logits) logits /= self.sampling_temperature + probs = softmax(logits) return numpy.random.choice(len(probs), p=probs) + def setup_onnx_file_path(self, multitoken: bool = False): + """ + Parses ONNX, tokenizer, and config file paths from the given `model_path`. 
+ Supports sparsezoo stubs + + :return: file path to the processed ONNX file for the engine to compile + """ + if multitoken: + onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( + self.model_path, + require_configs=True, + model_dir_onnx_name=_MODEL_DIR_ONNX_MULTI_TOKEN_NAME, + ) + else: + onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( + self.model_path, require_configs=True + ) + + self.config = AutoConfig.from_pretrained( + config_path, finetuning_task=self.task if hasattr(self, "task") else None + ) + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, + model_max_length=self.sequence_length, + ) + self.config_path = os.path.join(config_path, "config.json") + self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json") + + # overwrite onnx graph to given required input shape + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, + max_length=self.sequence_length, + custom_input_overwrite_func=self.overwrite_onnx_model_inputs, + ) + + model = onnx.load_model(onnx_path, load_external_data=False) + self.external_outputs = [out for out in model.graph.output] + + return onnx_path + + def _initialize_kv_cache(self): + # initialize empty kv cache + empty_kv_cache_tensor = numpy.zeros( + ( + self._batch_size, # batch size + self.external_outputs[1] + .type.tensor_type.shape.dim[1] + .dim_value, # num heads + self.sequence_length - 1, # sequence length - 1 + self.external_outputs[1].type.tensor_type.shape.dim[3].dim_value, + ), + dtype=numpy.float32, + ) # hidden size + + cache_keys = [ + output.name.replace("present", "past_key_values") + for output in self.external_outputs + if output.name.startswith("present") + ] + return {key: empty_kv_cache_tensor for key in cache_keys} + def _assemble_kv_cache( - self, cache_values: List[numpy.ndarray], tokens: List[int] + self, + cache_values: List[numpy.ndarray], + tokens: List[int], + prompt_inference=False, ) -> Dict[str, numpy.ndarray]: # rename the output names to match the names expected # in the next autoregressive pass @@ -294,23 +377,36 @@ def _assemble_kv_cache( ] kv_cache = dict(zip(cache_keys, cache_values)) for key, val in kv_cache.items(): - val[:, :, len(tokens) - 1] = val[:, :, -1] - kv_cache[key] = numpy.ascontiguousarray(val[:, :, :-1]) + if prompt_inference: + # remove the information about the `new_token` from the cache + val = val[:, :, :-1] + # zero out all the info that does not pertain to the + # "seen" `token` sequence + val[:, :, len(tokens) :] = 0.0 + kv_cache[key] = numpy.ascontiguousarray(val) + + else: + # move the information about the `new_token` to the + # end of the valid cache + val[:, :, len(tokens) - 1] = val[:, :, -1] + kv_cache[key] = numpy.ascontiguousarray(val[:, :, :-1]) return kv_cache - def _adapt_onnx_file_sequence_len(self, sequence_length: int): - model = onnx.load(self.onnx_file_path) - initializer_input_names = set([node.name for node in model.graph.initializer]) - external_inputs = [ - inp for inp in model.graph.input if inp.name not in initializer_input_names - ] + @staticmethod + def overwrite_onnx_model_inputs( + external_inputs: List[ValueInfoProto], batch_size: int, max_length: int + ) -> List[str]: + """ + Overwrite the input shape of the onnx model. 
+ + :param external_inputs: the external inputs of the onnx model + :param batch_size: the batch size of the input + :param max_length: the max length of the input + :return: the input names of the onnx model + """ input_names = [] for external_input in external_inputs: - # this is removed for now (will need to be accounted for when we start - # supporting deepsparse engine - external_input.type.tensor_type.shape.dim[0].dim_value = 1 - external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size input_names.append(external_input.name) - - onnx.save(model, self.onnx_file_path) + return input_names diff --git a/src/deepsparse/pipelines/text_generation_config.py b/src/deepsparse/pipelines/text_generation_config.py new file mode 100644 index 0000000000..1a9d9db97c --- /dev/null +++ b/src/deepsparse/pipelines/text_generation_config.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +CODEGEN_INPUTS_CONFIG = { + "input_ids": ["batch_size", "sequence_length"], + "attention_mask": ["batch_size", "past_sequence_length + sequence_length"], + "past_key_values": [ + "batch_size", + "num_heads", + "past_sequence_length + 1", + "hidden_size", + ], +} + + +def softmax(x: np.ndarray) -> np.ndarray: + """ + Compute softmax values for x + :param x: input array + :return: softmax values + """ + return np.exp(x) / np.sum(np.exp(x), axis=0) diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py index aa6c349eb6..41a9b3472f 100644 --- a/src/deepsparse/tasks.py +++ b/src/deepsparse/tasks.py @@ -95,6 +95,10 @@ class SupportedTasks: ), ) + text_generation = namedtuple("text_generation", ["codegen"])( + codegen=AliasedTask("codegen", []), + ) + image_classification = namedtuple("image_classification", ["image_classification"])( image_classification=AliasedTask( "image_classification", @@ -150,6 +154,9 @@ def check_register_task( # custom task, register the CustomPipeline import deepsparse.pipelines.custom_pipeline # noqa: F401 + elif cls.is_text_generation(task): + import deepsparse.pipelines.text_generation + elif cls.is_nlp(task): # trigger transformers pipelines to register with Pipeline.register import deepsparse.transformers.pipelines # noqa: F401 @@ -193,6 +200,20 @@ def check_register_task( f"{list(all_tasks)}" ) + @classmethod + def is_text_generation(cls, task: str) -> bool: + """ + :param task: the name of the task to check whether it is a text generation task + such as codegen + :return: True if it is a text generation task, False otherwise + """ + return any( + [ + text_generation_task.matches(task) + for text_generation_task in cls.text_generation + ] + ) + @classmethod def is_nlp(cls, task: str) -> bool: """ diff --git a/src/deepsparse/text_generation/__init__.py b/src/deepsparse/text_generation/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ 
b/src/deepsparse/text_generation/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/deepsparse/text_generation/pipelines/__init__.py b/src/deepsparse/text_generation/pipelines/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/src/deepsparse/text_generation/pipelines/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/src/deepsparse/text_generation/pipelines/codegen.py b/src/deepsparse/text_generation/pipelines/codegen.py new file mode 100644 index 0000000000..5f18e9101a --- /dev/null +++ b/src/deepsparse/text_generation/pipelines/codegen.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from deepsparse import Pipeline +from deepsparse.pipelines.text_generation import TextGenerationPipeline + + +@Pipeline.register(task="codegen", default_model_path=None) +class CodeGenPipeline(TextGenerationPipeline): + super().__init__() diff --git a/src/deepsparse/text_generation/utils.py b/src/deepsparse/text_generation/utils.py new file mode 100644 index 0000000000..54f4555295 --- /dev/null +++ b/src/deepsparse/text_generation/utils.py @@ -0,0 +1,24 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
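As an illustrative aside: the `softmax` helper defined just below (and its twin in `pipelines/text_generation.py` above) exponentiates the raw logits directly. A numerically stable variant with the same interface subtracts the maximum logit first so that large values cannot overflow `exp`; a minimal sketch:

```python
import numpy as np


def stable_softmax(x: np.ndarray) -> np.ndarray:
    """
    Compute softmax values for x without overflowing on large logits
    :param x: input array
    :return: softmax values
    """
    # shifting by the max leaves the result unchanged but keeps exp() bounded
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)
```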
+ +import numpy as np + + +def softmax(x: np.ndarray) -> np.ndarray: + """ + Compute softmax values for x + :param x: input array + :return: softmax values + """ + return np.exp(x) / np.sum(np.exp(x), axis=0) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index fde8c9132b..8a1e174018 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -52,6 +52,7 @@ def get_onnx_path_and_configs( model_path: str, require_configs: bool = False, + model_dir_onnx_name: str = _MODEL_DIR_ONNX_NAME, ) -> Tuple[str, Optional[str], Optional[str]]: """ :param model_path: path to onnx file, transformers sparsezoo stub, @@ -61,6 +62,7 @@ def get_onnx_path_and_configs( :param require_configs: if True, model_path must be a directory containing `model.onnx`, `config.json`, and `tokenizer.json` files. Will raise an exception otherwise + :param model_dir_onnx_name: name of onnx file in model directory :return: tuple of ONNX file path, parent directory of config file if it exists, and parent directory of tokenizer config file if it exists. (Parent directories returned instead of absolute path @@ -75,13 +77,13 @@ def get_onnx_path_and_configs( if os.path.isdir(model_path): model_files = os.listdir(model_path) - if _MODEL_DIR_ONNX_NAME not in model_files: + if model_dir_onnx_name not in model_files: raise ValueError( - f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " + f"{model_dir_onnx_name} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, model_dir_onnx_name)}" ) - onnx_path = os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(model_path, model_dir_onnx_name) # attempt to read config and tokenizer from sparsezoo-like framework directory framework_dir = None From 58bc2b0e70c7d55b6111b4931aa522e5d6188168 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 11 May 2023 10:51:49 +0000 Subject: [PATCH 12/68] Pipeline fully refactored, time to test engine support. Note: Sliding window not yet implemented! 
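To make the `model_dir_onnx_name` argument added to `get_onnx_path_and_configs` just above concrete, here is an illustrative sketch of resolving the multi-token decoder export the way the text generation pipeline does; the local directory path is a hypothetical placeholder.

```python
from deepsparse.transformers.helpers import get_onnx_path_and_configs

# select the multi-token decoder export instead of the default model.onnx
onnx_path, config_dir, tokenizer_dir = get_onnx_path_and_configs(
    "/path/to/codegen-350M-multi",  # hypothetical local model directory
    require_configs=True,
    model_dir_onnx_name="decoder_model.onnx",
)
```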
--- examples/codegen/README.md | 99 ---- examples/codegen/text_generation.py | 490 ------------------ src/deepsparse/pipeline.py | 2 +- .../pipelines/text_generation_config.py | 36 -- src/deepsparse/text_generation/__init__.py | 13 - .../text_generation/pipelines/__init__.py | 13 - .../text_generation/pipelines/codegen.py | 21 - src/deepsparse/text_generation/utils.py | 24 - 8 files changed, 1 insertion(+), 697 deletions(-) delete mode 100644 examples/codegen/README.md delete mode 100644 examples/codegen/text_generation.py delete mode 100644 src/deepsparse/pipelines/text_generation_config.py delete mode 100644 src/deepsparse/text_generation/__init__.py delete mode 100644 src/deepsparse/text_generation/pipelines/__init__.py delete mode 100644 src/deepsparse/text_generation/pipelines/codegen.py delete mode 100644 src/deepsparse/text_generation/utils.py diff --git a/examples/codegen/README.md b/examples/codegen/README.md deleted file mode 100644 index 1e47c01f7b..0000000000 --- a/examples/codegen/README.md +++ /dev/null @@ -1,99 +0,0 @@ - - -## ONNX Export -Firstly, we need to install HuggingFace optimum library -```bash -pip install optimum -``` - -### Patch the original PyTorch Model -First apply the following modification to this file in your transformers installation: -https://github.com/huggingface/transformers/blob/main/src/transformers/models/codegen/modeling_codegen.py#L212 - -\```diff --offset = layer_past[0].shape[-2] -+offset = (attention_mask[0] == 0.0).sum() - 1.0 -\``` - -We need to do this because the existing with_past implementations assume there is no padding in the inputs. With deepsparse, we need to use static sequence length, which means our offset for the embeddings will depend on how many non-padded inputs we receive. - -The new line checks this with the attention_mask. At this point in the code, attention_mask has been transformed from a tensor with 0s and 1s, to a tensor of `float.min` and `0.0`. So when we compare `attention_mask == 0.0` we are actually saying everywhere the attention_mask is 1. - -We also need to subtract 1 from this count, because the attention mask is applied AFTER the kv cache is concatenated to the new token, which means the attention mask will actually be sequence length + 1 items. So we subtract 1 to get the current sequence length. - -### Export the model to ONNX - -```bash -optimum-cli export onnx --model Salesforce/codegen-350M-multi codegen-350M-multi -``` -This saves the model to directory `codegen-350-multi` - -### Updating Model's Inputs Outputs Dimension Sizes -TODO - -## Running in the DeepSparse Pipeline - -First, we need to rename `decoder_with_past_model.onnx` to `model.onnx` inside -the `static-codegen-350-multi`, to abide the naming convention - -Finally, run the pipeline: - -```python -from examples.codegen.text_generation import TextGenerationPipeline - -codegen = TextGenerationPipeline( - model_path="/network/damian/static-codegen-350M-multi", - engine_type="onnxruntime", - sequence_length=128) - -out = codegen(sequences="def hello_world():") -print(out.sequences[0]) -``` - -```bash -def hello_world(): - return 'Hello World!' - -def hello_world_2(): - return 'Hello World!' - -def hello_world_3(): - return 'Hello World!' - -def hello_world_4(): - return 'Hello World!' - -def hello_world_5(): - return 'Hello World!' - -def hello_world_6(): - return 'Hello World!' - -def hello_world_7(): - return 'Hello World!' - -def hello_world_8(): - return 'Hello World!' - -def hello -``` - -Modifying pipeline behaviour: -1. 
By adding argument `deterministic=False`, the next token of the sequence will not be chosen deterministically (using argmax), but will be -sampled from the probablility distribution. -2. By setting `sampling_temperature` when `deterministic=False`, we are allowing more or less randomness in the sampling method (https://towardsdatascience.com/how-to-sample-from-language-models-682bceb97277) -3. By setting `num_tokens_to_generate`, we strictly specify how many tokens we want to generate per input. diff --git a/examples/codegen/text_generation.py b/examples/codegen/text_generation.py deleted file mode 100644 index 0797a8c658..0000000000 --- a/examples/codegen/text_generation.py +++ /dev/null @@ -1,490 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from tempfile import NamedTemporaryFile -from typing import Dict, List, Optional, Tuple, Type, Union - -import numpy -import onnx -from pydantic import BaseModel, Field - -from deepsparse import Context, MultiModelEngine, Pipeline -from deepsparse.pipeline import ( - DEEPSPARSE_ENGINE, - ORT_ENGINE, - SUPPORTED_PIPELINE_ENGINES, - Engine, - ORTEngine, -) -from deepsparse.text_generation.utils import softmax -from deepsparse.transformers.pipelines import TransformersPipeline - - -_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" -_MODEL_DIR_ONNX_NAME = "model.onnx" - -__all__ = ["TextGenerationPipeline"] - - -def overwrite_transformer_onnx_model_inputs( - path: str, - batch_size: int = 1, - max_length: int = 128, - output_path: Optional[str] = None, -) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: - """ - Overrides an ONNX model's inputs to have the given batch size and sequence lengths. 
- Assumes that these are the first and second shape indices of the given model inputs - respectively - - :param path: path to the ONNX model to override - :param batch_size: batch size to set - :param max_length: max sequence length to set - :param output_path: if provided, the model will be saved to the given path, - otherwise, the model will be saved to a named temporary file that will - be deleted after the program exits - :return: if no output path, a tuple of the saved path to the model, list of - model input names, and reference to the tempfile object will be returned - otherwise, only the model input names will be returned - """ - # overwrite input shapes - model = onnx.load(path) - initializer_input_names = set([node.name for node in model.graph.initializer]) - external_inputs = [ - inp for inp in model.graph.input if inp.name not in initializer_input_names - ] - input_names = [] - for external_input in external_inputs: - # this is removed for now (will need to be accounted for when we start - # supporting deepsparse engine - # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # external_input.type.tensor_type.shape.dim[1].dim_value = max_length - input_names.append(external_input.name) - - # Save modified model - if output_path is None: - tmp_file = NamedTemporaryFile() # file will be deleted after program exit - onnx.save(model, tmp_file.name) - - return tmp_file.name, input_names, tmp_file - else: - onnx.save(model, output_path) - return input_names - - -class TextGenerationInput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The input sequence(s) to generate " - "text from. If a string is provided, " - "the model will generate text from the " - "provided sequence. If a list of strings " - "is provided, the model will " - "generate text from each sequence in the list.", - ) - - -class TextGenerationOutput(BaseModel): - sequences: Union[str, List[str]] = Field( - description="The input text sequence(s) appended with " - "the generated text sequence(s). " - "If a string was provided as input, " - "a string will be returned. " - "If a list of strings was provided as " - "input, a list of strings will be returned.", - ) - - -class TextGenerationPipeline(TransformersPipeline): - """ - Pipeline for text generation tasks. - - :param deterministic: if True, the pipeline will sample from - the probability distribution computed from the logits. - If False, the pipeline will get the next token by applying - an argmax function to the logits. - :param sampling_temperature: the temperature to use when sampling - from the probability distribution computed from the logits. - Higher values will result in more random samples. - :param num_tokens_to_generate: the number of tokens to generate - given the input sequence. If None, the model will generate - tokens until the end of the sequence is reached. 
- :param kwargs: kwargs to pass to the TransformersPipeline - """ - - def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - num_tokens_to_generate: Optional[int] = None, - **kwargs, - ): - super().__init__(**kwargs) - self.deterministic = deterministic - self.sampling_temperature = sampling_temperature - self.num_tokens_to_generate = num_tokens_to_generate - - # set-up the auxiliary multitoken model - self.onnx_multitoken_path = self._setup_multitoken_onnx_file_path() - # initialize the auxiliary multitoken engine - self.multitoken_engine = self._initialize_multitoken_engine() - - # re-initialize the target model - # this will be removed once codegen is productionized - self.onnx_path = self._setup_onnx_file_path() - self.engine = self._reinitialize_engine() - - if self._batch_size != 1: - raise ValueError( - "For the sake of simplicity, only dynamic" - "batch shape is supported for now. " - "Set `batch_size` to 1 or None." - ) - - @staticmethod - def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs - ) -> Pipeline: - """ - This method is used to route the input to the correct pipeline. - - :param args: args to pass to the pipeline - :param input_schema: the input schema for the pipeline - :param pipelines: the list of pipelines to route the input to - :param kwargs: kwargs to pass to the pipeline - :return: the pipeline to route the input to - """ - raise NotImplementedError - - @property - def input_schema(self) -> Type[BaseModel]: - """ - Property to return the input schema for the pipeline. - - :return: the input schema for the pipeline - """ - return TextGenerationInput - - @property - def output_schema(self) -> Type[BaseModel]: - """ - Property to return the output schema for the pipeline. - - :return: the output schema for the pipeline - """ - return TextGenerationOutput - - def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs - ) -> BaseModel: - """ - Convert the engine outputs to the output schema for the pipeline. - - :param engine_outputs: the outputs from the engine - :return: the output schema for the pipeline - """ - sequences = self.tokenizer.batch_decode( - engine_outputs[0], skip_special_tokens=True - ) - return TextGenerationOutput(sequences=sequences) - - def process_inputs(self, inputs: BaseModel) -> List[numpy.ndarray]: - """ - Convert the input schema for the pipeline to the inputs for the engine. 
- - :param inputs: the input schema for the pipeline - :return: the inputs for the engine - """ - sequences = inputs.sequences - - if isinstance(sequences, List) and all( - isinstance(sequence, List) and len(sequence) == 1 for sequence in sequences - ): - # if batch items contain only one sequence but are wrapped in lists, unwrap - # for use as tokenizer input - sequences = [sequence[0] for sequence in sequences] - - self.tokenizer.pad_token = self.tokenizer.eos_token - - input_tokens = self.tokenizer( - sequences, - return_tensors="np", - max_length=self.sequence_length, - padding="max_length", - ) - - engine_input = self.tokens_to_engine_input( - input_tokens, onnx_input_names=self.multitoken_engine._input_names - ) - - return engine_input - - def engine_forward( - self, engine_inputs: List[numpy.ndarray], **kwargs - ) -> numpy.ndarray: - """ - :param engine_inputs: list of numpy inputs to - Pipeline engine forward pass - :return: A numpy array that contains the tokens generated by the model - """ - - # list of the meaningful tokens in the sequence - tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] - - tokens, kv_cache = self.initial_autoregressive_pass( - tokens=tokens, engine_inputs=engine_inputs - ) - - # establish the number of autoregressive passes to perform - num_iterations = self.sequence_length - len(tokens) - if self.num_tokens_to_generate: - if self.num_tokens_to_generate > num_iterations: - raise ValueError( - f"Num_tokens_to_generate ({self.num_tokens_to_generate}) " - f"cannot be greater than sequence_length ({self.sequence_length}) " - f"minus the number of tokens in the input sequence ({len(tokens)})." - ) - num_iterations = self.num_tokens_to_generate - - # perform the remaining autoregressive passes - for iter in range(num_iterations): - eos_token_found = self.tokenizer.eos_token_id == tokens[-1] - if eos_token_found: - # fill the token list so that it has the correct sequence length - tokens = tokens + [self.tokenizer.pad_token_id] * ( - self.sequence_length - len(tokens) - ) - return numpy.array([[tokens]]) - - tokens, kv_cache = self.autoregressive_pass( - tokens=tokens, - kv_cache=kv_cache, - ) - - # fill the token list so that it has the correct sequence length - tokens = tokens + [self.tokenizer.pad_token_id] * ( - self.sequence_length - len(tokens) - ) - return numpy.array([[tokens]]) - - def autoregressive_pass( - self, - tokens: List[int], - kv_cache: Dict[str, numpy.ndarray], - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: - """ - Performs an autoregressive pass to generate the next token in the sequence - and update the kv_cache with the new key/value pairs. - - 1) Set the attention mask to 1 for the tokens that are already in the sequence - and 1 for the `new_token` - at the last position. This is because the - `new_token`'s key/value will be added to the set of keys/values - at the last position (before being fed to an attention block) - 2) Set up the engine inputs - 3) Run the engine forward pas - 4) Preprocesses the kv cache so that it can be used as input to the next - autoregressive pass. - 5) Returns the new token sequence and the updated kv cache. 
- - :param tokens: the current token sequence - :param kv_cache: the current kv_cache - :return: the new token sequence and the updated kv cache - """ - - new_token = tokens[-1] - - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, : len(tokens)] = 1 - attention_mask[:, -1] = 1 - - engine_inputs_dict = { - "input_ids": numpy.array([[new_token]]), - "attention_mask": attention_mask, - } - engine_inputs_dict.update(kv_cache) - - engine_inputs = [engine_inputs_dict[name] for name in self.engine._input_names] - - new_logits, *new_kvs = self.engine(engine_inputs) - - # rename the output names to match the names expected - # in the next autoregressive pass - kv_output_names = [ - name.replace("present", "past_key_values") - for name in self.engine._output_names - if name.startswith("present") - ] - kv_cache = dict(zip(kv_output_names, new_kvs)) - for k, v in kv_cache.items(): - v[:, :, len(tokens) - 1] = v[:, :, -1] - kv_cache[k] = numpy.ascontiguousarray(v[:, :, :-1]) - - # Obtain the next token from the logits - new_token = TextGenerationPipeline.sample_new_token( - logits=new_logits[0, -1, :], - deterministic=self.deterministic, - temperature=self.sampling_temperature, - ) - tokens.append(new_token) - - return tokens, kv_cache - - def initial_autoregressive_pass( - self, - tokens: List[int], - engine_inputs: List[numpy.ndarray], - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: - """ - Performs a single autoregressive pass to initialize the key, value cache. - - 1) Obtains logits and kv cache for the input sequence. - From logits, obtains the next token. - 2) Preprocesses the kv cache so that it can be used as input to the next - autoregressive pass. - 3) Returns the new token sequence and the updated kv cache. - - :param tokens: input tokens provided by the user - :param engine_inputs: list of numpy inputs to Pipeline - engine forward pass - :return: the extended token sequence and the kv cache - """ - - past_logits, *new_kvs = self.multitoken_engine(engine_inputs) - - # rename the output names to match the names expected - # in the next autoregressive pass - kv_output_names = [ - name.replace("present", "past_key_values") - for name in self.multitoken_engine._output_names - if name.startswith("present") - ] - kv_cache = dict(zip(kv_output_names, new_kvs)) - for k, v in kv_cache.items(): - # remove the information about the `new_token` from the cache - v = v[:, :, :-1] - # zero out all the info that does not pertain to the - # "seen" `token` sequence - v[:, :, len(tokens) :] = 0.0 - kv_cache[k] = numpy.ascontiguousarray(v) - - # Obtain the next token from the logits - new_token = TextGenerationPipeline.sample_new_token( - logits=past_logits[0, len(tokens) - 1], - deterministic=self.deterministic, - temperature=self.sampling_temperature, - ) - tokens.append(new_token) - - return tokens, kv_cache - - @staticmethod - def sample_new_token( - logits: numpy.ndarray, deterministic: bool, temperature: float - ) -> int: - """ - Samples a token from the logits using the sampling temperature. 
- - :param logits: the logits from the model - :param deterministic: whether to sample from the softmax or take the argmax - :param temperature: the sampling temperature - - :return: the sampled token - """ - if deterministic: - return numpy.argmax(logits) - else: - logits /= temperature - probs = softmax(logits) - return numpy.random.choice(len(probs), p=probs) - - def _setup_multitoken_onnx_file_path(self) -> str: - # `setup_onnx_file_path` function rewritten - # to setup the multitoken_onnx_file_path - - multitoken_onnx_path = os.path.join( - self.model_path, _MODEL_DIR_ONNX_MULTI_TOKEN_NAME - ) - ( - multitoken_onnx_path, - self.multitoken_onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - multitoken_onnx_path, max_length=self.sequence_length - ) - - return multitoken_onnx_path - - def _initialize_multitoken_engine(self) -> Union[Engine, ORTEngine]: - # `_initialize_engine` function rewritten - # to initialize the multitoken_engine - - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - if self.context is not None and isinstance(self.context, Context): - self._engine_args.pop("num_cores", None) - self._engine_args.pop("scheduler", None) - self._engine_args["context"] = self.context - return MultiModelEngine( - model=self.onnx_multitoken_path, - **self._engine_args, - ) - return Engine(self.onnx_multitoken_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_multitoken_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) - - def _setup_onnx_file_path(self) -> str: - # `setup_onnx_file_path` function rewritten - - onnx_path = os.path.join(self.model_path, _MODEL_DIR_ONNX_NAME) - ( - onnx_path, - self.onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - onnx_path, max_length=self.sequence_length - ) - - return onnx_path - - def _initialize_engine(self): - return None - - def _reinitialize_engine(self) -> Union[Engine, ORTEngine]: - # `_initialize_engine` function rewritten - - engine_type = self.engine_type.lower() - - if engine_type == DEEPSPARSE_ENGINE: - if self.context is not None and isinstance(self.context, Context): - self._engine_args.pop("num_cores", None) - self._engine_args.pop("scheduler", None) - self._engine_args["context"] = self.context - return MultiModelEngine( - model=self.onnx_path, - **self._engine_args, - ) - return Engine(self.onnx_path, **self._engine_args) - elif engine_type == ORT_ENGINE: - return ORTEngine(self.onnx_path, **self._engine_args) - else: - raise ValueError( - f"Unknown engine_type {self.engine_type}. 
Supported values include: " - f"{SUPPORTED_PIPELINE_ENGINES}" - ) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 4ffda8800d..3fac78592d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/pipelines/text_generation_config.py b/src/deepsparse/pipelines/text_generation_config.py deleted file mode 100644 index 1a9d9db97c..0000000000 --- a/src/deepsparse/pipelines/text_generation_config.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -CODEGEN_INPUTS_CONFIG = { - "input_ids": ["batch_size", "sequence_length"], - "attention_mask": ["batch_size", "past_sequence_length + sequence_length"], - "past_key_values": [ - "batch_size", - "num_heads", - "past_sequence_length + 1", - "hidden_size", - ], -} - - -def softmax(x: np.ndarray) -> np.ndarray: - """ - Compute softmax values for x - :param x: input array - :return: softmax values - """ - return np.exp(x) / np.sum(np.exp(x), axis=0) diff --git a/src/deepsparse/text_generation/__init__.py b/src/deepsparse/text_generation/__init__.py deleted file mode 100644 index 0c44f887a4..0000000000 --- a/src/deepsparse/text_generation/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/src/deepsparse/text_generation/pipelines/__init__.py b/src/deepsparse/text_generation/pipelines/__init__.py deleted file mode 100644 index 0c44f887a4..0000000000 --- a/src/deepsparse/text_generation/pipelines/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/src/deepsparse/text_generation/pipelines/codegen.py b/src/deepsparse/text_generation/pipelines/codegen.py deleted file mode 100644 index 5f18e9101a..0000000000 --- a/src/deepsparse/text_generation/pipelines/codegen.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from deepsparse import Pipeline -from deepsparse.pipelines.text_generation import TextGenerationPipeline - - -@Pipeline.register(task="codegen", default_model_path=None) -class CodeGenPipeline(TextGenerationPipeline): - super().__init__() diff --git a/src/deepsparse/text_generation/utils.py b/src/deepsparse/text_generation/utils.py deleted file mode 100644 index 54f4555295..0000000000 --- a/src/deepsparse/text_generation/utils.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -def softmax(x: np.ndarray) -> np.ndarray: - """ - Compute softmax values for x - :param x: input array - :return: softmax values - """ - return np.exp(x) / np.sum(np.exp(x), axis=0) From d53844478f5f6512e2571c721a5cad65baec2601 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 11 May 2023 12:27:12 +0000 Subject: [PATCH 13/68] First iteration with Sage --- src/deepsparse/engine.py | 53 +++++++++++++++++++++ src/deepsparse/pipeline.py | 7 ++- src/deepsparse/pipelines/text_generation.py | 30 ++++++++---- 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index e08bb06b0c..26ebabf78f 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -844,6 +844,59 @@ def __init__( context.value, ) +class KVCacheEngine(Engine): + """ + Engine that can do kv caching. 
+ """ + def __init__( + self, + model: Union[str, "Model", "File"], + batch_size: int = 1, + num_cores: int = None, + num_streams: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, + ): + _analytics.send_event("python__engine__init") + self._model_path = model_to_path(model) + self._batch_size = _validate_batch_size(batch_size) + self._num_cores = _validate_num_cores(num_cores) + self._scheduler = _validate_scheduler(scheduler) + self._input_shapes = input_shapes + self._cpu_avx_type = AVX_TYPE + self._cpu_vnni = VNNI + + num_streams = _validate_num_streams(num_streams, self._num_cores) + if self._input_shapes: + raise NotImplementedError("") + # with override_onnx_input_shapes( + # self._model_path, self._input_shapes + # ) as model_path: + # self._eng_net = LIB.deepsparse_engine( + # model_path, + # self._batch_size, + # self._num_cores, + # num_streams, + # self._scheduler.value, + # None, + # ) + else: + # create a boolean list of every output of the + # model (logits, key0, value0, key1, value1, ..., key19, value, 19) + kv_cache_bools = [True for i in range(41)] + kv_cache_bools[0] = False # logits ought not to be cached + + self._eng_net = LIB.deepsparse_engine( + self._model_path, + self._batch_size, + self._num_cores, + num_streams, + self._scheduler.value, + None, + kv_cache_bools, # pass in the boolean list + 0 # since we start with no initial cache, pass in 0 for the initial cached position + ) + class KVCacheEngine(Engine): """ diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 3fac78592d..999ef190c6 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -25,7 +25,7 @@ import numpy from pydantic import BaseModel, Field -from deepsparse import Context, Engine, MultiModelEngine, Scheduler +from deepsparse import Context, Engine, KVCacheEngine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine from deepsparse.cpu import cpu_details from deepsparse.loggers.base_logger import BaseLogger @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = list(self.executor.map(self.engine_forward, batches)) + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) @@ -573,6 +573,7 @@ def create_engine( engine_type: str, engine_args: Dict, context: Optional[Context] = None, + support_kv_cache: bool = False ) -> Union[Engine, MultiModelEngine, ORTEngine]: engine_type = engine_type.lower() @@ -585,6 +586,8 @@ def create_engine( model=onnx_file_path, **engine_args, ) + if support_kv_cache: + return KVCacheEngine(onnx_file_path, **engine_args) return Engine(onnx_file_path, **engine_args) if engine_type == ORT_ENGINE: diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 686f4af12a..2924c722f3 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -87,7 +87,7 @@ def __init__( prompt_batch_threshold: float = 0.25, **kwargs, ): - super().__init__(**kwargs) + super().__init__(**kwargs, _delay_engine_initialize=True) if self._batch_size != 1: raise ValueError("Only batch size 1 is supported for generation pipelines") @@ -97,14 +97,15 @@ def __init__( self.max_generated_tokens = max_generated_tokens self.prompt_batch_threshold = 
prompt_batch_threshold + self.engine = Pipeline.create_engine( + self.onnx_file_path, self.engine_type, self.engine_args, self.context, support_kv_cache=True) # additional setup the multitoken engine, # used for large inputs to generate kv cache # TODO: to be deprecated after Sage's changes self.onnx_multitoken_path = self.setup_onnx_file_path(multitoken=True) # initialize the auxiliary multitoken engine - self.multitoken_engine = Pipeline.create_engine( - self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context - ) + #self.multitoken_engine = Pipeline.create_engine( + # self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context) @staticmethod def route_input_to_bucket( @@ -221,6 +222,7 @@ def prompt_inference( new_token = None if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: + print('first') # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] kv_cache = {} @@ -230,6 +232,7 @@ def prompt_inference( run_tokens, kv_cache ) else: + print('second') # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) kv_cache = self._assemble_kv_cache( @@ -270,7 +273,7 @@ def autoregressive_inference( kv_cache = kv_cache if kv_cache else self._initialize_kv_cache() engine_inputs.update(kv_cache) - engine_inputs = [engine_inputs[name] for name in self.engine._input_names] + engine_inputs = [engine_inputs[name] for name in self.engine.input_names] new_logits, *cache_values = self.engine(engine_inputs) kv_cache = self._assemble_kv_cache(cache_values, tokens) @@ -372,7 +375,7 @@ def _assemble_kv_cache( # in the next autoregressive pass cache_keys = [ name.replace("present", "past_key_values") - for name in self.engine._output_names + for name in self.engine.output_names if name.startswith("present") ] kv_cache = dict(zip(cache_keys, cache_values)) @@ -395,7 +398,7 @@ def _assemble_kv_cache( @staticmethod def overwrite_onnx_model_inputs( - external_inputs: List[ValueInfoProto], batch_size: int, max_length: int + external_inputs: List[ValueInfoProto], batch_size: int, sequence_length: int ) -> List[str]: """ Overwrite the input shape of the onnx model. 
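The hunk that follows pins every decoder input to a static shape so the engine can be compiled with a persistent KV cache. As a rough standalone sketch of that idea (this is not the helper being patched here; the function name is hypothetical and the 16-head / 64-dim cache sizes are model-specific assumptions), the reshaping looks roughly like:

# Minimal sketch: pin an ONNX decoder's inputs to static shapes for KV caching.
# Hypothetical helper; the input names follow this patch, while the head count
# and per-head width are illustrative and would normally come from the config.
import onnx


def pin_decoder_input_shapes(
    model_path: str,
    output_path: str,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_heads: int = 16,
    hidden_dims: int = 64,
) -> None:
    model = onnx.load(model_path)
    initializer_names = {init.name for init in model.graph.initializer}

    for inp in model.graph.input:
        if inp.name in initializer_names:
            continue  # skip weights, only reshape true graph inputs
        dims = inp.type.tensor_type.shape.dim
        if inp.name == "input_ids":
            # one new token per autoregressive step
            dims[0].dim_value = batch_size
            dims[1].dim_value = 1
        elif inp.name == "attention_mask":
            # full static mask covering cached positions plus the new token
            dims[0].dim_value = batch_size
            dims[1].dim_value = sequence_length
        elif inp.name.startswith("past_key_values"):
            # cached keys/values for the previous sequence_length - 1 positions,
            # assumed rank-4: (batch, heads, cached_positions, head_dim)
            dims[0].dim_value = batch_size
            dims[1].dim_value = num_heads
            dims[2].dim_value = sequence_length - 1
            dims[3].dim_value = hidden_dims

    onnx.save(model, output_path)

Static shapes trade flexibility for ahead-of-time compilation: every step runs with the same tensor sizes, so the cache has to be padded and trimmed around the real tokens instead of growing with them.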
@@ -407,6 +410,17 @@ def overwrite_onnx_model_inputs( """ input_names = [] for external_input in external_inputs: - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + if external_input.name == 'input_ids': + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = 1 + elif external_input.name == 'attention_mask': + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + else: + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = 16 + external_input.type.tensor_type.shape.dim[2].dim_value = sequence_length - 1 + external_input.type.tensor_type.shape.dim[3].dim_value = 64 + input_names.append(external_input.name) return input_names From e19676b0c7a9bc7180712aed6fed4b2061de89b9 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 11 May 2023 14:29:38 +0200 Subject: [PATCH 14/68] Apply suggestions from code review --- src/deepsparse/pipeline.py | 2 +- src/deepsparse/pipelines/text_generation.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 999ef190c6..319e25eee1 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 2924c722f3..c1a5ef519e 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -222,7 +222,6 @@ def prompt_inference( new_token = None if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: - print('first') # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] kv_cache = {} @@ -232,7 +231,6 @@ def prompt_inference( run_tokens, kv_cache ) else: - print('second') # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) kv_cache = self._assemble_kv_cache( From 7908b7499ba5ceaa9c30b4150d7ff4b051e0fcc0 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 11 May 2023 19:07:42 +0000 Subject: [PATCH 15/68] ORT agrees with the Engine. But they both give not entirely correct result. Hey, this is good news still --- src/deepsparse/engine.py | 22 +++++++++++---------- src/deepsparse/pipeline.py | 2 +- src/deepsparse/pipelines/text_generation.py | 19 ++++++++++++------ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index 26ebabf78f..6d6c4116ea 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -844,18 +844,20 @@ def __init__( context.value, ) + class KVCacheEngine(Engine): """ Engine that can do kv caching. 
""" + def __init__( - self, - model: Union[str, "Model", "File"], - batch_size: int = 1, - num_cores: int = None, - num_streams: int = None, - scheduler: Scheduler = None, - input_shapes: List[List[int]] = None, + self, + model: Union[str, "Model", "File"], + batch_size: int = 1, + num_cores: int = None, + num_streams: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, ): _analytics.send_event("python__engine__init") self._model_path = model_to_path(model) @@ -884,7 +886,7 @@ def __init__( # create a boolean list of every output of the # model (logits, key0, value0, key1, value1, ..., key19, value, 19) kv_cache_bools = [True for i in range(41)] - kv_cache_bools[0] = False # logits ought not to be cached + kv_cache_bools[0] = False # logits ought not to be cached self._eng_net = LIB.deepsparse_engine( self._model_path, @@ -893,8 +895,8 @@ def __init__( num_streams, self._scheduler.value, None, - kv_cache_bools, # pass in the boolean list - 0 # since we start with no initial cache, pass in 0 for the initial cached position + kv_cache_bools, # pass in the boolean list + 0, # since we start with no initial cache, pass in 0 for the initial cached position ) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 319e25eee1..4dd75401b8 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -573,7 +573,7 @@ def create_engine( engine_type: str, engine_args: Dict, context: Optional[Context] = None, - support_kv_cache: bool = False + support_kv_cache: bool = False, ) -> Union[Engine, MultiModelEngine, ORTEngine]: engine_type = engine_type.lower() diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index c1a5ef519e..80afb8f109 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -98,13 +98,18 @@ def __init__( self.prompt_batch_threshold = prompt_batch_threshold self.engine = Pipeline.create_engine( - self.onnx_file_path, self.engine_type, self.engine_args, self.context, support_kv_cache=True) + self.onnx_file_path, + self.engine_type, + self.engine_args, + self.context, + support_kv_cache=True, + ) # additional setup the multitoken engine, # used for large inputs to generate kv cache # TODO: to be deprecated after Sage's changes self.onnx_multitoken_path = self.setup_onnx_file_path(multitoken=True) # initialize the auxiliary multitoken engine - #self.multitoken_engine = Pipeline.create_engine( + # self.multitoken_engine = Pipeline.create_engine( # self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context) @staticmethod @@ -238,7 +243,7 @@ def prompt_inference( ) new_token = self.generate_token(logits[0, len(tokens) - 1]) - tokens.append(new_token) + tokens.append(new_token) return tokens, kv_cache @@ -408,16 +413,18 @@ def overwrite_onnx_model_inputs( """ input_names = [] for external_input in external_inputs: - if external_input.name == 'input_ids': + if external_input.name == "input_ids": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = 1 - elif external_input.name == 'attention_mask': + elif external_input.name == "attention_mask": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length else: external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = 16 - 
external_input.type.tensor_type.shape.dim[2].dim_value = sequence_length - 1 + external_input.type.tensor_type.shape.dim[2].dim_value = ( + sequence_length - 1 + ) external_input.type.tensor_type.shape.dim[3].dim_value = 64 input_names.append(external_input.name) From 4bc3472cf5d15e4ed2bdd611b32dd42970eff80e Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 12 May 2023 17:02:31 +0000 Subject: [PATCH 16/68] dynamic ORT vs static DS --- src/deepsparse/benchmark/ort_engine.py | 3 +- src/deepsparse/pipeline.py | 2 +- src/deepsparse/pipelines/text_generation.py | 61 +++++++++++++-------- src/deepsparse/transformers/helpers.py | 4 +- src/deepsparse/utils/onnx.py | 2 + 5 files changed, 44 insertions(+), 28 deletions(-) diff --git a/src/deepsparse/benchmark/ort_engine.py b/src/deepsparse/benchmark/ort_engine.py index d2d61e83a1..44dc0c475e 100644 --- a/src/deepsparse/benchmark/ort_engine.py +++ b/src/deepsparse/benchmark/ort_engine.py @@ -281,7 +281,8 @@ def run( :return: The list of outputs from the model after executing over the inputs """ if val_inp: - self._validate_inputs(inp) + pass + #self._validate_inputs(inp) inputs_dict = {name: value for name, value in zip(self.input_names, inp)} return self._eng_net.run(self.output_names, inputs_dict) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 4dd75401b8..5ad6bf3f78 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = list(self.executor.map(self.engine_forward, batches)) + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 80afb8f109..dca32d51f3 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -265,14 +265,21 @@ def autoregressive_inference( # Create the boolean attention mask: # e.g. 
[1, 1, 1, 1, 1, 0, 0, ..., 1] where first 1's correspond # to the kv_cache and the last one corresponds to the new token - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, : len(tokens)] = 1 - attention_mask[:, -1] = 1 + if self.engine_type == "onnxruntime": + attention_mask = numpy.ones((1, len(tokens)), dtype=numpy.int64) + engine_inputs = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + "cache_length": numpy.array(len(tokens) - 1, dtype=numpy.int64), + } + else: + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + attention_mask[:, :len(tokens)] = 1 - engine_inputs = { - "input_ids": numpy.array([[new_token]]), - "attention_mask": attention_mask, - } + engine_inputs = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + } kv_cache = kv_cache if kv_cache else self._initialize_kv_cache() engine_inputs.update(kv_cache) @@ -318,7 +325,9 @@ def setup_onnx_file_path(self, multitoken: bool = False): ) else: onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( - self.model_path, require_configs=True + self.model_path, + require_configs=True, + model_dir_onnx_name = "model_fixed.onnx" if self.engine_type == "onnxruntime" else "model.onnx", ) self.config = AutoConfig.from_pretrained( @@ -339,7 +348,7 @@ def setup_onnx_file_path(self, multitoken: bool = False): ) = overwrite_transformer_onnx_model_inputs( onnx_path, max_length=self.sequence_length, - custom_input_overwrite_func=self.overwrite_onnx_model_inputs, + custom_input_overwrite_func=self.overwrite_onnx_model_inputs if not self.engine_type == "onnxruntime" else None, ) model = onnx.load_model(onnx_path, load_external_data=False) @@ -355,7 +364,7 @@ def _initialize_kv_cache(self): self.external_outputs[1] .type.tensor_type.shape.dim[1] .dim_value, # num heads - self.sequence_length - 1, # sequence length - 1 + 0 if self.engine_type == "onnxruntime" else self.sequence_length - 1, self.external_outputs[1].type.tensor_type.shape.dim[3].dim_value, ), dtype=numpy.float32, @@ -382,20 +391,22 @@ def _assemble_kv_cache( if name.startswith("present") ] kv_cache = dict(zip(cache_keys, cache_values)) - for key, val in kv_cache.items(): - if prompt_inference: - # remove the information about the `new_token` from the cache - val = val[:, :, :-1] - # zero out all the info that does not pertain to the - # "seen" `token` sequence - val[:, :, len(tokens) :] = 0.0 - kv_cache[key] = numpy.ascontiguousarray(val) - else: - # move the information about the `new_token` to the - # end of the valid cache - val[:, :, len(tokens) - 1] = val[:, :, -1] - kv_cache[key] = numpy.ascontiguousarray(val[:, :, :-1]) + if not self.engine_type == "onnxruntime": + for key, val in kv_cache.items(): + if prompt_inference: + # remove the information about the `new_token` from the cache + val = val[:, :, :-1] + # zero out all the info that does not pertain to the + # "seen" `token` sequence + val[:, :, len(tokens) :] = 0.0 + kv_cache[key] = numpy.ascontiguousarray(val) + + else: + # move the information about the `new_token` to the + # end of the valid cache + val[:, :, len(tokens) - 1] = val[:, :, -1] + kv_cache[key] = numpy.ascontiguousarray(val[:, :, :-1]) return kv_cache @@ -419,13 +430,15 @@ def overwrite_onnx_model_inputs( elif external_input.name == "attention_mask": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length - else: + elif 
external_input.name.startswith("past_key_values"): external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = 16 external_input.type.tensor_type.shape.dim[2].dim_value = ( sequence_length - 1 ) external_input.type.tensor_type.shape.dim[3].dim_value = 64 + else: + pass input_names.append(external_input.name) return input_names diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 8a1e174018..7db8dd80bd 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -175,8 +175,8 @@ def overwrite_transformer_onnx_model_inputs( else: input_names = [] for external_input in external_inputs: - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = max_length + #external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + #external_input.type.tensor_type.shape.dim[1].dim_value = max_length input_names.append(external_input.name) # Save modified model diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 326c4b215d..83ead7a5ab 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -211,6 +211,8 @@ def override_onnx_batch_size( input for input in all_inputs if input.name not in initializer_input_names ] for external_input in external_inputs: + if external_input.name == "cache_length": + continue external_input.type.tensor_type.shape.dim[0].dim_value = batch_size # Save modified model, this will be cleaned up when context is exited From c07f7ed8712e37683e2c9e43e822654b2dba1f11 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 16 May 2023 19:20:47 +0000 Subject: [PATCH 17/68] pipeline handles OPT multitoken pass --- src/deepsparse/benchmark/ort_engine.py | 2 +- src/deepsparse/pipeline.py | 6 +- src/deepsparse/pipelines/text_generation.py | 136 ++++++++------------ src/deepsparse/tasks.py | 3 +- src/deepsparse/transformers/helpers.py | 4 +- 5 files changed, 65 insertions(+), 86 deletions(-) diff --git a/src/deepsparse/benchmark/ort_engine.py b/src/deepsparse/benchmark/ort_engine.py index 44dc0c475e..9a79705f78 100644 --- a/src/deepsparse/benchmark/ort_engine.py +++ b/src/deepsparse/benchmark/ort_engine.py @@ -282,7 +282,7 @@ def run( """ if val_inp: pass - #self._validate_inputs(inp) + # self._validate_inputs(inp) inputs_dict = {name: value for name, value in zip(self.input_names, inp)} return self._eng_net.run(self.output_names, inputs_dict) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 5ad6bf3f78..105e828365 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -260,10 +260,12 @@ def __call__(self, *args, **kwargs) -> BaseModel: # ------ INFERENCE ------ # split inputs into batches of size `self._batch_size` timer.start(InferencePhases.ENGINE_FORWARD) - batches = self.split_engine_inputs(engine_inputs, self._batch_size) + # Hack to enable inference with `cache_length` argument + # batches = self.split_engine_inputs(engine_inputs, self._batch_size) + batches = [engine_inputs] # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index dca32d51f3..aeeea8addd 
100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -58,7 +58,7 @@ class TextGenerationOutput(BaseModel): @Pipeline.register( task="text_generation", - task_aliases=["codegen"], + task_aliases=["codegen", "opt"], ) class TextGenerationPipeline(TransformersPipeline): """ @@ -96,21 +96,20 @@ def __init__( self.sampling_temperature = sampling_temperature self.max_generated_tokens = max_generated_tokens self.prompt_batch_threshold = prompt_batch_threshold - + # when we are done with the static inference in ORT, + # set support_kv_cache = True self.engine = Pipeline.create_engine( self.onnx_file_path, self.engine_type, self.engine_args, self.context, - support_kv_cache=True, ) - # additional setup the multitoken engine, - # used for large inputs to generate kv cache - # TODO: to be deprecated after Sage's changes + self.onnx_multitoken_path = self.setup_onnx_file_path(multitoken=True) # initialize the auxiliary multitoken engine - # self.multitoken_engine = Pipeline.create_engine( - # self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context) + self.multitoken_engine = Pipeline.create_engine( + self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context + ) @staticmethod def route_input_to_bucket( @@ -162,6 +161,10 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: padding="max_length", ) + kv_cache = self._initialize_kv_cache() + cache_length = {"cache_length": numpy.array(0, dtype=numpy.int64)} + + input_tokens = {**input_tokens, **kv_cache, **cache_length} engine_input = self.tokens_to_engine_input(input_tokens) return engine_input @@ -192,7 +195,7 @@ def engine_forward( tokens, kv_cache = self.prompt_inference(engine_inputs) # create the generated output - # TODO: Get clarity here, are we running the sliding window there? + # TODO: Apply sliding window logic max_tokens = ( self.max_generated_tokens if self.max_generated_tokens and self.max_generated_tokens > 0 @@ -238,10 +241,8 @@ def prompt_inference( else: # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) - kv_cache = self._assemble_kv_cache( - cache_values, tokens, prompt_inference=True - ) - new_token = self.generate_token(logits[0, len(tokens) - 1]) + kv_cache = self._assemble_kv_cache(cache_values, tokens) + new_token = self.generate_token(logits[0, : len(tokens) + 1]) tokens.append(new_token) @@ -262,26 +263,15 @@ def autoregressive_inference( """ new_token = tokens[-1] - # Create the boolean attention mask: - # e.g. 
[1, 1, 1, 1, 1, 0, 0, ..., 1] where first 1's correspond - # to the kv_cache and the last one corresponds to the new token - if self.engine_type == "onnxruntime": - attention_mask = numpy.ones((1, len(tokens)), dtype=numpy.int64) - engine_inputs = { - "input_ids": numpy.array([[new_token]]), - "attention_mask": attention_mask, - "cache_length": numpy.array(len(tokens) - 1, dtype=numpy.int64), - } - else: - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, :len(tokens)] = 1 + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + attention_mask[:, : len(tokens) + 1] = 1 - engine_inputs = { - "input_ids": numpy.array([[new_token]]), - "attention_mask": attention_mask, - } + engine_inputs = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + "cache_length": numpy.array(len(tokens), dtype=numpy.int64), + } - kv_cache = kv_cache if kv_cache else self._initialize_kv_cache() engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] @@ -317,28 +307,20 @@ def setup_onnx_file_path(self, multitoken: bool = False): :return: file path to the processed ONNX file for the engine to compile """ - if multitoken: - onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( - self.model_path, - require_configs=True, - model_dir_onnx_name=_MODEL_DIR_ONNX_MULTI_TOKEN_NAME, - ) - else: - onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( - self.model_path, - require_configs=True, - model_dir_onnx_name = "model_fixed.onnx" if self.engine_type == "onnxruntime" else "model.onnx", - ) - - self.config = AutoConfig.from_pretrained( - config_path, finetuning_task=self.task if hasattr(self, "task") else None + onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( + self.model_path, + require_configs=True, + model_dir_onnx_name="model_kv_cache.onnx", ) + + self.config = AutoConfig.from_pretrained(config_path) + + # So far hardcoding the tokenizer, to be figured out later self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_path, + "facebook/opt-350m", model_max_length=self.sequence_length, ) self.config_path = os.path.join(config_path, "config.json") - self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json") # overwrite onnx graph to given required input shape ( @@ -347,8 +329,8 @@ def setup_onnx_file_path(self, multitoken: bool = False): self._temp_model_directory, ) = overwrite_transformer_onnx_model_inputs( onnx_path, - max_length=self.sequence_length, - custom_input_overwrite_func=self.overwrite_onnx_model_inputs if not self.engine_type == "onnxruntime" else None, + max_length=self.sequence_length if multitoken else 1, + custom_input_overwrite_func=self.overwrite_onnx_model_inputs, ) model = onnx.load_model(onnx_path, load_external_data=False) @@ -359,13 +341,13 @@ def setup_onnx_file_path(self, multitoken: bool = False): def _initialize_kv_cache(self): # initialize empty kv cache empty_kv_cache_tensor = numpy.zeros( + # hard coded for now, we can fetch it automatically + # from engine output shapes, but because it overwrites + # 16 with 1, not feasible for now ( - self._batch_size, # batch size - self.external_outputs[1] - .type.tensor_type.shape.dim[1] - .dim_value, # num heads - 0 if self.engine_type == "onnxruntime" else self.sequence_length - 1, - self.external_outputs[1].type.tensor_type.shape.dim[3].dim_value, + 16, # num heads + 0, + 64, # hidden dims ), dtype=numpy.float32, ) # hidden size @@ -381,7 
+363,6 @@ def _assemble_kv_cache( self, cache_values: List[numpy.ndarray], tokens: List[int], - prompt_inference=False, ) -> Dict[str, numpy.ndarray]: # rename the output names to match the names expected # in the next autoregressive pass @@ -392,21 +373,13 @@ def _assemble_kv_cache( ] kv_cache = dict(zip(cache_keys, cache_values)) - if not self.engine_type == "onnxruntime": - for key, val in kv_cache.items(): - if prompt_inference: - # remove the information about the `new_token` from the cache - val = val[:, :, :-1] - # zero out all the info that does not pertain to the - # "seen" `token` sequence - val[:, :, len(tokens) :] = 0.0 - kv_cache[key] = numpy.ascontiguousarray(val) - - else: - # move the information about the `new_token` to the - # end of the valid cache - val[:, :, len(tokens) - 1] = val[:, :, -1] - kv_cache[key] = numpy.ascontiguousarray(val[:, :, :-1]) + # uncomment if we want to isolate only the + # meaningful portion of the cv_cache, probably + # not needed in the static inference case, + # time will tell + # for key, val in kv_cache.items(): + # # isolate only the meaningful portion of the cv_cache + # kv_cache[key] = val[:1, : :] return kv_cache @@ -426,19 +399,22 @@ def overwrite_onnx_model_inputs( for external_input in external_inputs: if external_input.name == "input_ids": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = 1 + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length elif external_input.name == "attention_mask": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length elif external_input.name.startswith("past_key_values"): - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = 16 - external_input.type.tensor_type.shape.dim[2].dim_value = ( - sequence_length - 1 - ) - external_input.type.tensor_type.shape.dim[3].dim_value = 64 - else: + external_input.type.tensor_type.shape.dim[0].dim_value = 16 # num heads + external_input.type.tensor_type.shape.dim[1].dim_value = ( + 0 if sequence_length != 1 else 256 + ) # past_sequence_length + external_input.type.tensor_type.shape.dim[ + 2 + ].dim_value = 64 # hidden dims + elif external_input.name.startswith("cache_length"): pass + else: + raise ValueError(f"Unexpected input name: {external_input.name}") input_names.append(external_input.name) return input_names diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py index 41a9b3472f..7ea812f4cc 100644 --- a/src/deepsparse/tasks.py +++ b/src/deepsparse/tasks.py @@ -95,8 +95,9 @@ class SupportedTasks: ), ) - text_generation = namedtuple("text_generation", ["codegen"])( + text_generation = namedtuple("text_generation", ["opt", "codegen"])( codegen=AliasedTask("codegen", []), + opt=AliasedTask("opt", []), ) image_classification = namedtuple("image_classification", ["image_classification"])( diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 7db8dd80bd..b072639272 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -175,8 +175,8 @@ def overwrite_transformer_onnx_model_inputs( else: input_names = [] for external_input in external_inputs: - #external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - #external_input.type.tensor_type.shape.dim[1].dim_value = max_length + # 
external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + # external_input.type.tensor_type.shape.dim[1].dim_value = max_length input_names.append(external_input.name) # Save modified model From fb7783876622c5b76554760ed5217cfd6c7d98e9 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Tue, 16 May 2023 19:28:36 -0400 Subject: [PATCH 18/68] fixes to get static pipeline a little further along --- src/deepsparse/pipelines/text_generation.py | 2 +- src/deepsparse/utils/onnx.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index aeeea8addd..f58a1f3648 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -402,7 +402,7 @@ def overwrite_onnx_model_inputs( external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length elif external_input.name == "attention_mask": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + external_input.type.tensor_type.shape.dim[1].dim_value = 256 # sequence_length elif external_input.name.startswith("past_key_values"): external_input.type.tensor_type.shape.dim[0].dim_value = 16 # num heads external_input.type.tensor_type.shape.dim[1].dim_value = ( diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 83ead7a5ab..465b14987e 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -213,6 +213,8 @@ def override_onnx_batch_size( for external_input in external_inputs: if external_input.name == "cache_length": continue + if external_input.name.startswith("past_key_values"): + continue # TODO: really should set to BS * num_heads, skipping for now for testing external_input.type.tensor_type.shape.dim[0].dim_value = batch_size # Save modified model, this will be cleaned up when context is exited From 2097463ff5bfdf8423ab40767a35afedc1d768d9 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Wed, 17 May 2023 16:05:00 -0400 Subject: [PATCH 19/68] adjust shapes and slicing to enable static autoregressive pass - ISSUE: tokens past the base seq len are repeated --- src/deepsparse/pipelines/text_generation.py | 60 ++++++++++++++------- src/deepsparse/transformers/helpers.py | 16 ++++-- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index f58a1f3648..ef80248703 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -264,12 +264,13 @@ def autoregressive_inference( new_token = tokens[-1] attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, : len(tokens) + 1] = 1 + attention_mask[:, :len(tokens)] = 1 + cache_length = min(len(tokens) - 1, self.sequence_length - 1) engine_inputs = { "input_ids": numpy.array([[new_token]]), "attention_mask": attention_mask, - "cache_length": numpy.array(len(tokens), dtype=numpy.int64), + "cache_length": numpy.array(cache_length, dtype=numpy.int64), } engine_inputs.update(kv_cache) @@ -279,7 +280,7 @@ def autoregressive_inference( kv_cache = self._assemble_kv_cache(cache_values, tokens) # Obtain the next token from the logits - generated_token = self.generate_token(new_logits[0, -1, :]) + generated_token = self.generate_token(new_logits[0, 0, :]) return generated_token, kv_cache @@ -329,8 +330,11 @@ def setup_onnx_file_path(self, multitoken: 
bool = False): self._temp_model_directory, ) = overwrite_transformer_onnx_model_inputs( onnx_path, - max_length=self.sequence_length if multitoken else 1, + max_length=self.sequence_length, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, + custom_input_overwrite_func_kwargs=dict( + multitoken=multitoken, + ), ) model = onnx.load_model(onnx_path, load_external_data=False) @@ -364,6 +368,25 @@ def _assemble_kv_cache( cache_values: List[numpy.ndarray], tokens: List[int], ) -> Dict[str, numpy.ndarray]: + + if cache_values[0].shape[1] > self.sequence_length - 1: + # adjust cache to proper shape + if len(tokens) > self.sequence_length - 1: + # all values in cache are from non-pad tokens + # pop from front + # idxs = [idx for idx in range(self.sequence_length) if idx != 9] + # cache_values = [ + # cache_value[:, idxs, :] for cache_value in cache_values + # ] + cache_values = [ + cache_value[:, 1:, :] for cache_value in cache_values + ] + else: + # some tokens are padded - pop from back + cache_values = [ + cache_value[:, :-1, :] for cache_value in cache_values + ] + # rename the output names to match the names expected # in the next autoregressive pass cache_keys = [ @@ -373,19 +396,14 @@ def _assemble_kv_cache( ] kv_cache = dict(zip(cache_keys, cache_values)) - # uncomment if we want to isolate only the - # meaningful portion of the cv_cache, probably - # not needed in the static inference case, - # time will tell - # for key, val in kv_cache.items(): - # # isolate only the meaningful portion of the cv_cache - # kv_cache[key] = val[:1, : :] - return kv_cache @staticmethod def overwrite_onnx_model_inputs( - external_inputs: List[ValueInfoProto], batch_size: int, sequence_length: int + external_inputs: List[ValueInfoProto], + batch_size: int, + sequence_length: int, + multitoken: bool, ) -> List[str]: """ Overwrite the input shape of the onnx model. 
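The attention-mask and cache-trimming logic in this commit is easier to follow in isolation. Below is a small numpy sketch of the same bookkeeping; the function names are hypothetical and the exact slot dropped from a partially filled cache is illustrative only, since that index changes in later commits of this series depending on how the model pads:

# Hedged numpy sketch of the static-shape bookkeeping: build the attention mask
# for one autoregressive step (prompt tokens on the left, generated tokens filled
# in from the right) and trim the returned cache from sequence_length entries
# back to the sequence_length - 1 slots the next step expects.
import numpy


def build_attention_mask(
    sequence_length: int, num_prompt_tokens: int, num_generated_tokens: int
) -> numpy.ndarray:
    mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
    mask[:, :num_prompt_tokens] = 1  # prompt occupies the left of the window
    if num_generated_tokens:
        # generated tokens are right-concatenated into the window
        mask[:, -min(num_generated_tokens, sequence_length):] = 1
    return mask


def trim_cache(
    cache_value: numpy.ndarray,
    total_tokens: int,
    num_prompt_tokens: int,
    sequence_length: int,
) -> numpy.ndarray:
    # cache_value: (num_heads, sequence_length, head_dim) as returned by the engine
    if total_tokens > sequence_length - 1:
        # window is full of real tokens: drop the oldest entry from the front
        return cache_value[:, 1:, :]
    # window still holds padding: drop one unused slot just past the prompt
    # (the exact index is model- and padding-dependent, see the revisions in this series)
    keep = [i for i in range(sequence_length) if i != num_prompt_tokens + 1]
    return cache_value[:, keep, :]


mask = build_attention_mask(sequence_length=16, num_prompt_tokens=4, num_generated_tokens=3)
print(mask.tolist())  # 1s at positions 0-3 and at the last three positions

cache = numpy.zeros((16, 16, 64), dtype=numpy.float32)  # (heads, seq_len, head_dim)
print(trim_cache(cache, total_tokens=7, num_prompt_tokens=4, sequence_length=16).shape)
# (16, 15, 64), ready to be fed back in as past_key_values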
@@ -393,21 +411,27 @@ def overwrite_onnx_model_inputs( :param external_inputs: the external inputs of the onnx model :param batch_size: the batch size of the input :param max_length: the max length of the input + :param multitoken: true if model is to be run with seq len > 1 :return: the input names of the onnx model """ input_names = [] for external_input in external_inputs: if external_input.name == "input_ids": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + external_input.type.tensor_type.shape.dim[1].dim_value = ( + sequence_length if multitoken else 1 + ) elif external_input.name == "attention_mask": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = 256 # sequence_length + # even in single token cached runs, full attention mask + # will be provided + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length elif external_input.name.startswith("past_key_values"): - external_input.type.tensor_type.shape.dim[0].dim_value = 16 # num heads + external_input.type.tensor_type.shape.dim[0].dim_value = 16 # n_heads + # no cache for multitoken runs, otherwise max cache len is max len - 1 external_input.type.tensor_type.shape.dim[1].dim_value = ( - 0 if sequence_length != 1 else 256 - ) # past_sequence_length + 0 if multitoken else sequence_length - 1 + ) external_input.type.tensor_type.shape.dim[ 2 ].dim_value = 64 # hidden dims diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index b072639272..98592196d5 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -21,7 +21,7 @@ import re from pathlib import Path from tempfile import NamedTemporaryFile -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy import onnx @@ -140,6 +140,7 @@ def overwrite_transformer_onnx_model_inputs( output_path: Optional[str] = None, load_external_data: bool = True, custom_input_overwrite_func: Optional[Callable] = None, + custom_input_overwrite_func_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ Overrides an ONNX model's inputs to have the given batch size and sequence lengths. @@ -155,9 +156,10 @@ def overwrite_transformer_onnx_model_inputs( :param load_external_data: if True, external data will be loaded into the model graph. If False, external data will not be loaded and the model will be saved without external data - :custom_input_overwrite_func: if provided, this function will be called instead - of the default input overwrite function. This function should take in a list - of external inputs and return a list of the overwritten input names + :param custom_input_overwrite_func: if provided, this function will be called + instead of the default input overwrite function. 
This function should take + in a list of external inputs and return a list of the overwritten input names + :param custom_input_overwrite_func_kwargs: kwargs for the custom overwrite function :return: if no output path, a tuple of the saved path to the model, list of model input names, and reference to the tempfile object will be returned otherwise, only the model input names will be returned @@ -169,8 +171,12 @@ def overwrite_transformer_onnx_model_inputs( inp for inp in model.graph.input if inp.name not in initializer_input_names ] if custom_input_overwrite_func is not None: + custom_input_overwrite_func_kwargs = custom_input_overwrite_func_kwargs or {} input_names = custom_input_overwrite_func( - external_inputs, batch_size, max_length + external_inputs, + batch_size, + max_length, + **custom_input_overwrite_func_kwargs, ) else: input_names = [] From 5eb10a94908da552f51169d78dcf18a49b32d011 Mon Sep 17 00:00:00 2001 From: Benjamin Date: Thu, 18 May 2023 00:22:34 -0400 Subject: [PATCH 20/68] migrate from cache_length to positions input --- src/deepsparse/pipelines/text_generation.py | 72 +++++++++++++-------- 1 file changed, 45 insertions(+), 27 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index ef80248703..d7d3142fb1 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -162,9 +162,13 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: ) kv_cache = self._initialize_kv_cache() - cache_length = {"cache_length": numpy.array(0, dtype=numpy.int64)} - input_tokens = {**input_tokens, **kv_cache, **cache_length} + attention_mask = input_tokens["attention_mask"] + positions = attention_mask.cumsum(1) * attention_mask + positions -= 1 # zero index + positions_input = dict(positions=positions) + + input_tokens = {**input_tokens, **kv_cache, **positions_input} engine_input = self.tokens_to_engine_input(input_tokens) return engine_input @@ -193,6 +197,7 @@ def engine_forward( """ # run the prompt through tokens, kv_cache = self.prompt_inference(engine_inputs) + num_prompt_tokens = len(tokens) - 1 # create the generated output # TODO: Apply sliding window logic @@ -204,7 +209,9 @@ def engine_forward( generated = [tokens[-1]] while len(generated) < max_tokens: - gen_token, kv_cache = self.autoregressive_inference(tokens, kv_cache) + gen_token, kv_cache = self.autoregressive_inference( + tokens, kv_cache, num_prompt_tokens + ) tokens.append(gen_token) generated.append(gen_token) @@ -241,7 +248,9 @@ def prompt_inference( else: # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) - kv_cache = self._assemble_kv_cache(cache_values, tokens) + kv_cache = self._assemble_kv_cache( + cache_values, tokens, len(tokens) - 1 + ) new_token = self.generate_token(logits[0, : len(tokens) + 1]) tokens.append(new_token) @@ -249,7 +258,10 @@ def prompt_inference( return tokens, kv_cache def autoregressive_inference( - self, tokens: List[int], kv_cache: Dict[str, numpy.ndarray] + self, + tokens: List[int], + kv_cache: Dict[str, numpy.ndarray], + num_prompt_tokens: int, ) -> Tuple[int, Dict[str, numpy.ndarray]]: """ An inference run that processes the last token and the kv cache to @@ -257,27 +269,37 @@ def autoregressive_inference( :param tokens: The current context (prompt + generated tokens so far) :param kv_cache: The key-value cache from the previous inference run + :param num_prompt_tokens: 
number of tokens in the initial prompt :return: - the list of prompt tokens plus the new, generated token - the kv cache that was populated during the inference """ new_token = tokens[-1] + num_generated_tokens = len(tokens) - num_prompt_tokens + # due to right hand concatenation, attention mask is: + # 1s for length of prompt + 1s starting from RHS for generated tokens + 1 attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, :len(tokens)] = 1 + attention_mask[:, :num_prompt_tokens + 1] = 1 # +1 because + # OPT adds an initial pad + # fill in generated tokens from RHS + attention_mask[:, -(min(num_generated_tokens, self.sequence_length)):] = 1 - cache_length = min(len(tokens) - 1, self.sequence_length - 1) + # the position of the token is the number of tokens - 1 (zero indexed) + positions = numpy.array([[len(tokens)]], dtype=numpy.int64) engine_inputs = { "input_ids": numpy.array([[new_token]]), "attention_mask": attention_mask, - "cache_length": numpy.array(cache_length, dtype=numpy.int64), + "positions": positions, } engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] new_logits, *cache_values = self.engine(engine_inputs) - kv_cache = self._assemble_kv_cache(cache_values, tokens) + kv_cache = self._assemble_kv_cache( + cache_values, tokens, num_prompt_tokens + ) # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) @@ -367,25 +389,23 @@ def _assemble_kv_cache( self, cache_values: List[numpy.ndarray], tokens: List[int], + num_prompt_tokens: int, ) -> Dict[str, numpy.ndarray]: - - if cache_values[0].shape[1] > self.sequence_length - 1: - # adjust cache to proper shape + # first, trim the output cache (seq_len) to input cache shape (seq_len - 1) + for idx, cache_value in enumerate(cache_values): if len(tokens) > self.sequence_length - 1: - # all values in cache are from non-pad tokens - # pop from front - # idxs = [idx for idx in range(self.sequence_length) if idx != 9] - # cache_values = [ - # cache_value[:, idxs, :] for cache_value in cache_values - # ] - cache_values = [ - cache_value[:, 1:, :] for cache_value in cache_values - ] + # all values in cache are from non-pad tokens, pop from front + cache_values[idx] = cache_value[:, 1:, :] else: - # some tokens are padded - pop from back - cache_values = [ - cache_value[:, :-1, :] for cache_value in cache_values + # remove the cache key/value immediately after the prompt since this + # is where the last padded value will go + idxs_to_keep = [ + idx + for idx in range(self.sequence_length) + if idx != num_prompt_tokens + 2 + # adding +1 is OPT specific since they always add an extra token ] + cache_values[idx] = cache_value[:, idxs_to_keep, :] # rename the output names to match the names expected # in the next autoregressive pass @@ -416,7 +436,7 @@ def overwrite_onnx_model_inputs( """ input_names = [] for external_input in external_inputs: - if external_input.name == "input_ids": + if external_input.name in ["input_ids", "positions"]: external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = ( sequence_length if multitoken else 1 @@ -435,8 +455,6 @@ def overwrite_onnx_model_inputs( external_input.type.tensor_type.shape.dim[ 2 ].dim_value = 64 # hidden dims - elif external_input.name.startswith("cache_length"): - pass else: raise ValueError(f"Unexpected input name: {external_input.name}") From 9213f2968fd5acd655eba9553f1a9cc18797da8d Mon 
Sep 17 00:00:00 2001 From: Damian Date: Thu, 18 May 2023 17:11:32 +0000 Subject: [PATCH 21/68] got if working for multitoken + single token scenario --- src/deepsparse/pipelines/text_generation.py | 42 +++++++++++++-------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index d7d3142fb1..6a224f095e 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -161,7 +161,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: padding="max_length", ) - kv_cache = self._initialize_kv_cache() + kv_cache = self._initialize_kv_cache(length = 0) attention_mask = input_tokens["attention_mask"] positions = attention_mask.cumsum(1) * attention_mask @@ -233,7 +233,7 @@ def prompt_inference( - the list of prompt tokens plus the new, generated token - the kv cache that was populated during the inference """ - tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] + tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] # [pad_token] + correct_tokens + [pad_tokens] -> [pad_token] + correct_tokens new_token = None if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: @@ -243,14 +243,12 @@ def prompt_inference( for token in tokens: run_tokens.append(token) new_token, kv_cache = self.autoregressive_inference( - run_tokens, kv_cache + run_tokens, kv_cache, num_prompt_tokens=0 ) else: # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) - kv_cache = self._assemble_kv_cache( - cache_values, tokens, len(tokens) - 1 - ) + kv_cache = self._assemble_kv_cache(cache_values, tokens, len(tokens) - 1) new_token = self.generate_token(logits[0, : len(tokens) + 1]) tokens.append(new_token) @@ -280,10 +278,10 @@ def autoregressive_inference( # due to right hand concatenation, attention mask is: # 1s for length of prompt + 1s starting from RHS for generated tokens + 1 attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, :num_prompt_tokens + 1] = 1 # +1 because + attention_mask[:, : num_prompt_tokens + 1] = 1 # +1 because # OPT adds an initial pad # fill in generated tokens from RHS - attention_mask[:, -(min(num_generated_tokens, self.sequence_length)):] = 1 + attention_mask[:, -(min(num_generated_tokens, self.sequence_length)) :] = 1 # the position of the token is the number of tokens - 1 (zero indexed) positions = numpy.array([[len(tokens)]], dtype=numpy.int64) @@ -292,14 +290,13 @@ def autoregressive_inference( "attention_mask": attention_mask, "positions": positions, } + kv_cache = self._initialize_kv_cache(length = self.sequence_length - 1) if kv_cache == {} else kv_cache engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] new_logits, *cache_values = self.engine(engine_inputs) - kv_cache = self._assemble_kv_cache( - cache_values, tokens, num_prompt_tokens - ) + kv_cache = self._assemble_kv_cache(cache_values, tokens, num_prompt_tokens) # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) @@ -364,7 +361,8 @@ def setup_onnx_file_path(self, multitoken: bool = False): return onnx_path - def _initialize_kv_cache(self): + def _initialize_kv_cache(self, length: int) -> Dict[str, numpy.ndarray]: + # initialize empty kv cache empty_kv_cache_tensor = numpy.zeros( # hard 
coded for now, we can fetch it automatically @@ -372,7 +370,7 @@ def _initialize_kv_cache(self): # 16 with 1, not feasible for now ( 16, # num heads - 0, + length, 64, # hidden dims ), dtype=numpy.float32, @@ -393,9 +391,21 @@ def _assemble_kv_cache( ) -> Dict[str, numpy.ndarray]: # first, trim the output cache (seq_len) to input cache shape (seq_len - 1) for idx, cache_value in enumerate(cache_values): - if len(tokens) > self.sequence_length - 1: + if len(tokens) + 1 > self.sequence_length - 1: # all values in cache are from non-pad tokens, pop from front - cache_values[idx] = cache_value[:, 1:, :] + # cache_values[idx] = cache_value[:, 1:, :] + # assuming first token is always SOS token keep it and only + # remove the first token (not zeroeth) in first dim + idxs_to_keep = [idx for idx in range(self.sequence_length) if idx != 1] # double check if SOS needed + + elif len(tokens) + 1 == self.sequence_length - 1: + # if we cannot fit more cache values, then we need to remove the + # last "empty" cache that remains empty at the `num_prompt_tokens` +1 + idxs_to_keep = [ + idx + for idx in range(self.sequence_length) + if idx != num_prompt_tokens + 1 + ] else: # remove the cache key/value immediately after the prompt since this # is where the last padded value will go @@ -405,7 +415,7 @@ def _assemble_kv_cache( if idx != num_prompt_tokens + 2 # adding +1 is OPT specific since they always add an extra token ] - cache_values[idx] = cache_value[:, idxs_to_keep, :] + cache_values[idx] = cache_value[:, idxs_to_keep, :] # rename the output names to match the names expected # in the next autoregressive pass From d9af004a0bcd975d3056b7dc4c2ee326d0ba5034 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 19 May 2023 12:20:37 +0000 Subject: [PATCH 22/68] cleanup the pipeline --- src/deepsparse/pipelines/text_generation.py | 238 +++++++++++++------- src/deepsparse/transformers/helpers.py | 19 +- 2 files changed, 171 insertions(+), 86 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 6a224f095e..62ec4aaecd 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -16,7 +16,6 @@ from typing import Dict, List, Optional, Tuple, Type import numpy -import onnx from onnx import ValueInfoProto from pydantic import BaseModel, Field from transformers import AutoConfig, AutoTokenizer @@ -29,8 +28,8 @@ from deepsparse.transformers.pipelines import TransformersPipeline -# TODO: to be deprecated after Sage's changes, we will only need a single model -_MODEL_DIR_ONNX_MULTI_TOKEN_NAME = "decoder_model.onnx" +OPT_CACHE_HIDDEN_DIM = 64 + __all__ = ["TextGenerationPipeline"] @@ -74,7 +73,7 @@ class TextGenerationPipeline(TransformersPipeline): :param max_generated_tokens: the maximum number of tokens to generate given the input sequence. If None, the model will generate tokens until the end of the sequence is reached. - Otherwise it will generate up to the maximum number of tokens or end of + Otherwise, it will generate up to the maximum number of tokens or end of sequence is reached. 
:param kwargs: kwargs to pass to the TransformersPipeline """ @@ -96,17 +95,20 @@ def __init__( self.sampling_temperature = sampling_temperature self.max_generated_tokens = max_generated_tokens self.prompt_batch_threshold = prompt_batch_threshold - # when we are done with the static inference in ORT, - # set support_kv_cache = True + self.engine = Pipeline.create_engine( self.onnx_file_path, self.engine_type, self.engine_args, self.context, + support_kv_cache=True, ) - self.onnx_multitoken_path = self.setup_onnx_file_path(multitoken=True) # initialize the auxiliary multitoken engine + ( + self.onnx_multitoken_path, + self._temp_model_directory, + ) = self._setup_onnx_multitoken_file_path() self.multitoken_engine = Pipeline.create_engine( self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context ) @@ -161,9 +163,10 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: padding="max_length", ) - kv_cache = self._initialize_kv_cache(length = 0) + kv_cache = self._initialize_kv_cache(length=0) attention_mask = input_tokens["attention_mask"] + positions = attention_mask.cumsum(1) * attention_mask positions -= 1 # zero index positions_input = dict(positions=positions) @@ -200,7 +203,6 @@ def engine_forward( num_prompt_tokens = len(tokens) - 1 # create the generated output - # TODO: Apply sliding window logic max_tokens = ( self.max_generated_tokens if self.max_generated_tokens and self.max_generated_tokens > 0 @@ -233,7 +235,11 @@ def prompt_inference( - the list of prompt tokens plus the new, generated token - the kv cache that was populated during the inference """ - tokens = [t for t in engine_inputs[0][0] if t != self.tokenizer.pad_token_id] # [pad_token] + correct_tokens + [pad_tokens] -> [pad_token] + correct_tokens + tokens = engine_inputs[0][0].tolist() + # remove trailing padding + while tokens[-1] == self.tokenizer.pad_token_id: + tokens.pop() + new_token = None if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: @@ -248,8 +254,8 @@ def prompt_inference( else: # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) - kv_cache = self._assemble_kv_cache(cache_values, tokens, len(tokens) - 1) - new_token = self.generate_token(logits[0, : len(tokens) + 1]) + kv_cache = self.assemble_kv_cache(cache_values, tokens, len(tokens) - 1) + new_token = self.generate_token(logits[0, len(tokens) - 1]) tokens.append(new_token) @@ -269,7 +275,7 @@ def autoregressive_inference( :param kv_cache: The key-value cache from the previous inference run :param num_prompt_tokens: number of tokens in the initial prompt :return: - - the list of prompt tokens plus the new, generated token + - the new, generated token - the kv cache that was populated during the inference """ new_token = tokens[-1] @@ -278,9 +284,7 @@ def autoregressive_inference( # due to right hand concatenation, attention mask is: # 1s for length of prompt + 1s starting from RHS for generated tokens + 1 attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, : num_prompt_tokens + 1] = 1 # +1 because - # OPT adds an initial pad - # fill in generated tokens from RHS + attention_mask[:, :num_prompt_tokens] = 1 attention_mask[:, -(min(num_generated_tokens, self.sequence_length)) :] = 1 # the position of the token is the number of tokens - 1 (zero indexed) @@ -290,13 +294,19 @@ def autoregressive_inference( "attention_mask": attention_mask, "positions": positions, } - 
kv_cache = self._initialize_kv_cache(length = self.sequence_length - 1) if kv_cache == {} else kv_cache + # initialize the kv cache if it is empty + # (when the prompt is processed with the single-token engine) + kv_cache = ( + self._initialize_kv_cache(length=self.sequence_length - 1) + if kv_cache == {} + else kv_cache + ) engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] new_logits, *cache_values = self.engine(engine_inputs) - kv_cache = self._assemble_kv_cache(cache_values, tokens, num_prompt_tokens) + kv_cache = self.assemble_kv_cache(cache_values, tokens, num_prompt_tokens) # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) @@ -307,7 +317,7 @@ def generate_token(self, logits: numpy.ndarray) -> int: """ Samples a token from the logits using the sampling temperature. - :param logits: the logits from the model + :param logits: the logits from the model with shape (vocab_size,) :return: the sampled token """ @@ -320,7 +330,7 @@ def generate_token(self, logits: numpy.ndarray) -> int: return numpy.random.choice(len(probs), p=probs) - def setup_onnx_file_path(self, multitoken: bool = False): + def setup_onnx_file_path(self) -> str: """ Parses ONNX, tokenizer, and config file paths from the given `model_path`. Supports sparsezoo stubs @@ -330,19 +340,17 @@ def setup_onnx_file_path(self, multitoken: bool = False): onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( self.model_path, require_configs=True, - model_dir_onnx_name="model_kv_cache.onnx", ) self.config = AutoConfig.from_pretrained(config_path) - # So far hardcoding the tokenizer, to be figured out later + # So far hard-coding the tokenizer, to be figured out later self.tokenizer = AutoTokenizer.from_pretrained( "facebook/opt-350m", model_max_length=self.sequence_length, ) self.config_path = os.path.join(config_path, "config.json") - # overwrite onnx graph to given required input shape ( onnx_path, self.onnx_input_names, @@ -352,73 +360,105 @@ def setup_onnx_file_path(self, multitoken: bool = False): max_length=self.sequence_length, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( - multitoken=multitoken, + multitoken=False, + num_attention_heads=self.config.num_attention_heads, + hidden_dims=OPT_CACHE_HIDDEN_DIM, ), ) - model = onnx.load_model(onnx_path, load_external_data=False) - self.external_outputs = [out for out in model.graph.output] - return onnx_path def _initialize_kv_cache(self, length: int) -> Dict[str, numpy.ndarray]: - - # initialize empty kv cache + # initialize empty kv cache of size + # (num_attention_heads, length, hidden_dims) empty_kv_cache_tensor = numpy.zeros( - # hard coded for now, we can fetch it automatically - # from engine output shapes, but because it overwrites - # 16 with 1, not feasible for now ( - 16, # num heads + self.config.num_attention_heads, length, - 64, # hidden dims + OPT_CACHE_HIDDEN_DIM, ), dtype=numpy.float32, - ) # hidden size + ) cache_keys = [ - output.name.replace("present", "past_key_values") - for output in self.external_outputs - if output.name.startswith("present") + output_name.replace("present", "past_key_values") + for output_name in self.engine.output_names + if output_name.startswith("present") ] return {key: empty_kv_cache_tensor for key in cache_keys} - def _assemble_kv_cache( + def assemble_kv_cache( self, cache_values: List[numpy.ndarray], tokens: List[int], num_prompt_tokens: int, + consider_sos_token: 
bool = False, ) -> Dict[str, numpy.ndarray]: - # first, trim the output cache (seq_len) to input cache shape (seq_len - 1) + """ + Restructure the kv cache values from the engine output, so + that it can be passed to the engine in the next inference run. + + By default, every time this function is called, the cache key/value, + that immediately follows the cache keys/values corresponding to the + prompt tokens, is removed. + Example: + ``` + (`X` are the padded cache entries) + attention_mask = [1, 1, 1, 0, 0, 0, 0, 1, 1] + cache_entries = [1, 2, 3, X, X, X, 4, 5] + ([1,2,3] entries correspond to the prompt tokens) + ([4,5] entries correspond to the tokens generated in the previous inference run) + assemble_kv_cache() + cache_entries -> [1, 2, 3, X, X, 4, 5] + ``` + + Then, once the cache is almost full -> only one padded cache entry remains, + we need to remove it, so that no padded entries are present in the cache. + Example: + ``` + attention_mask = [1, 1, 1, 0, 1, 1, 1, 1] + cache_entries = [1, 2, 3, X, 4, 5, 6, 7] + assemble_kv_cache() + cache_entries -> [1, 2, 3, 4, 5, 6, 7] + ``` + + Finally, where all values in cache are non-padded, we pop from the front + of the cache (remove the oldest history from the cache): + - if the SOS (Start Of Sequence) token is not considered, + we pop from the 0th index + Example: + ``` + attention_mask = [1, 1, 1, 1, 1, 1, 1, 1] + cache_entries = [1, 2, 3, 4, 5, 6, 7, 8] + assemble_kv_cache() + cache_entries -> [2, 3, 4, 5, 6, 7, 8] + ``` + + - if the SOS token is considered, we pop from the 1st index + (because the 0th index is the SOS token, which we want to keep) + Example: + ``` + attention_mask = [1, 1, 1, 1, 1, 1, 1, 1] + cache_entries = [1, 2, 3, 4, 5, 6, 7, 8] + assemble_kv_cache() + cache_entries -> [1, 3, 4, 5, 6, 7, 8] + ``` + :param cache_values: the cache values from the engine output + :param tokens: the tokens from the previous inference run + :param num_prompt_tokens: number of tokens in the initial prompt + :param consider_sos_token: whether to consider the SOS token in the cache + :return kv_cache: the restructured cache values + """ for idx, cache_value in enumerate(cache_values): - if len(tokens) + 1 > self.sequence_length - 1: - # all values in cache are from non-pad tokens, pop from front - # cache_values[idx] = cache_value[:, 1:, :] - # assuming first token is always SOS token keep it and only - # remove the first token (not zeroeth) in first dim - idxs_to_keep = [idx for idx in range(self.sequence_length) if idx != 1] # double check if SOS needed - - elif len(tokens) + 1 == self.sequence_length - 1: - # if we cannot fit more cache values, then we need to remove the - # last "empty" cache that remains empty at the `num_prompt_tokens` +1 - idxs_to_keep = [ - idx - for idx in range(self.sequence_length) - if idx != num_prompt_tokens + 1 - ] + if len(tokens) > self.sequence_length - 1: + idx_to_remove = int(not consider_sos_token) + elif len(tokens) == self.sequence_length - 1: + idx_to_remove = num_prompt_tokens else: - # remove the cache key/value immediately after the prompt since this - # is where the last padded value will go - idxs_to_keep = [ - idx - for idx in range(self.sequence_length) - if idx != num_prompt_tokens + 2 - # adding +1 is OPT specific since they always add an extra token - ] - cache_values[idx] = cache_value[:, idxs_to_keep, :] - - # rename the output names to match the names expected - # in the next autoregressive pass + idx_to_remove = num_prompt_tokens + 1 + + cache_values[idx] = numpy.delete(cache_value, 
idx_to_remove, 1) + cache_keys = [ name.replace("present", "past_key_values") for name in self.engine.output_names @@ -433,15 +473,29 @@ def overwrite_onnx_model_inputs( external_inputs: List[ValueInfoProto], batch_size: int, sequence_length: int, - multitoken: bool, + num_attention_heads: int, + hidden_dims: int, + multitoken: bool = True, ) -> List[str]: """ - Overwrite the input shape of the onnx model. - - :param external_inputs: the external inputs of the onnx model - :param batch_size: the batch size of the input - :param max_length: the max length of the input - :param multitoken: true if model is to be run with seq len > 1 + Overwrite the input shape of the onnx model. This function + is particular for the model with inputs: + - input_ids + - attention_mask + - positions + - past_key_values (x N) + + :param external_inputs: The external inputs of the onnx model + :param batch_size: The batch size of the input + :param sequence_length: The sequence length of the input + :param num_attention_heads: The number of attention heads + of the model (required to set the shape of the kv_cache) + :param hidden_dims: The hidden dimensions of the model + (required to set the shape of the kv_cache) + :param multitoken: A boolean flag that indicates whether + we are overwriting inputs to the model for multi-token + inference (sequence_len > 1) or single token inference + (sequence_len = 1). :return: the input names of the onnx model """ input_names = [] @@ -453,20 +507,40 @@ def overwrite_onnx_model_inputs( ) elif external_input.name == "attention_mask": external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # even in single token cached runs, full attention mask - # will be provided + # regardless of multi-token or not, + # we always provide full attention mask external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length elif external_input.name.startswith("past_key_values"): - external_input.type.tensor_type.shape.dim[0].dim_value = 16 # n_heads - # no cache for multitoken runs, otherwise max cache len is max len - 1 + external_input.type.tensor_type.shape.dim[ + 0 + ].dim_value = num_attention_heads + # empty cache for multi-token runs, + # otherwise max cache len is max len - 1 external_input.type.tensor_type.shape.dim[1].dim_value = ( 0 if multitoken else sequence_length - 1 ) - external_input.type.tensor_type.shape.dim[ - 2 - ].dim_value = 64 # hidden dims + external_input.type.tensor_type.shape.dim[2].dim_value = hidden_dims else: - raise ValueError(f"Unexpected input name: {external_input.name}") + raise ValueError( + f"Unexpected external input name: {external_input.name}" + ) input_names.append(external_input.name) return input_names + + def _setup_onnx_multitoken_file_path(self): + ( + onnx_multitoken_file_path, + _, + temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + self.onnx_file_path, + max_length=self.sequence_length, + load_external_data=False, + custom_input_overwrite_func=self.overwrite_onnx_model_inputs, + custom_input_overwrite_func_kwargs=dict( + num_attention_heads=self.config.num_attention_heads, + hidden_dims=OPT_CACHE_HIDDEN_DIM, + ), + ) + return onnx_multitoken_file_path, temp_model_directory diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 98592196d5..5b2ffcfc5e 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -181,17 +181,28 @@ def overwrite_transformer_onnx_model_inputs( else: input_names = [] for external_input in 
external_inputs: - # external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # external_input.type.tensor_type.shape.dim[1].dim_value = max_length + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = max_length input_names.append(external_input.name) + model_exceeds_protobuf_limit = model.ByteSize() > onnx.checker.MAXIMUM_PROTOBUF + # Save modified model if output_path is None: tmp_file = NamedTemporaryFile() # file will be deleted after program exit - save_onnx(model, tmp_file.name) + save_onnx( + model, tmp_file.name + ) if not model_exceeds_protobuf_limit else save_onnx( + model, tmp_file.name, external_data_file=NamedTemporaryFile() + ) return tmp_file.name, input_names, tmp_file else: - save_onnx(model, output_path) + save_onnx( + model, output_path + ) if not model_exceeds_protobuf_limit else save_onnx( + model, output_path, external_data_file=NamedTemporaryFile() + ) + return input_names From 476f25dc50b298d04cb7f0b2d3cda7a1e9a7e138 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 19 May 2023 12:32:46 +0000 Subject: [PATCH 23/68] further cleanup post merge --- src/deepsparse/pipelines/text_generation.py | 11 ++++++----- src/deepsparse/transformers/helpers.py | 10 ++++------ src/deepsparse/transformers/pipelines/pipeline.py | 14 +++++--------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 62ec4aaecd..80b6971482 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -528,19 +528,20 @@ def overwrite_onnx_model_inputs( input_names.append(external_input.name) return input_names - def _setup_onnx_multitoken_file_path(self): + def _setup_onnx_multitoken_file_path(self) -> str: ( - onnx_multitoken_file_path, + onnx_path, _, - temp_model_directory, + _temp_model_directory, ) = overwrite_transformer_onnx_model_inputs( self.onnx_file_path, max_length=self.sequence_length, - load_external_data=False, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( + multitoken=True, num_attention_heads=self.config.num_attention_heads, hidden_dims=OPT_CACHE_HIDDEN_DIM, ), ) - return onnx_multitoken_file_path, temp_model_directory + + return onnx_path, _temp_model_directory diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 5b2ffcfc5e..270f56d9c8 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -52,7 +52,6 @@ def get_onnx_path_and_configs( model_path: str, require_configs: bool = False, - model_dir_onnx_name: str = _MODEL_DIR_ONNX_NAME, ) -> Tuple[str, Optional[str], Optional[str]]: """ :param model_path: path to onnx file, transformers sparsezoo stub, @@ -62,7 +61,6 @@ def get_onnx_path_and_configs( :param require_configs: if True, model_path must be a directory containing `model.onnx`, `config.json`, and `tokenizer.json` files. Will raise an exception otherwise - :param model_dir_onnx_name: name of onnx file in model directory :return: tuple of ONNX file path, parent directory of config file if it exists, and parent directory of tokenizer config file if it exists. 
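For context on the `dim_value` assignments above, this is roughly what overwriting the external inputs of an ONNX graph to static shapes looks like. The helper name and the assumption that every external input is 2-D (batch, sequence) are illustrative, not part of the patch:

    import onnx

    def set_static_input_shapes(model_path: str, batch_size: int, max_length: int) -> onnx.ModelProto:
        # load only the graph; weights stored as external data stay on disk
        model = onnx.load(model_path, load_external_data=False)
        initializer_names = {init.name for init in model.graph.initializer}
        for graph_input in model.graph.input:
            if graph_input.name in initializer_names:
                continue  # skip weights that are exposed as graph inputs
            dims = graph_input.type.tensor_type.shape.dim
            dims[0].dim_value = batch_size
            dims[1].dim_value = max_length
        return model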
(Parent directories returned instead of absolute path @@ -77,13 +75,13 @@ def get_onnx_path_and_configs( if os.path.isdir(model_path): model_files = os.listdir(model_path) - if model_dir_onnx_name not in model_files: + if _MODEL_DIR_ONNX_NAME not in model_files: raise ValueError( - f"{model_dir_onnx_name} not found in transformers model directory " + f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, model_dir_onnx_name)}" + f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" ) - onnx_path = os.path.join(model_path, model_dir_onnx_name) + onnx_path = os.path.join(model_path, _MODEL_DIR_ONNX_NAME) # attempt to read config and tokenizer from sparsezoo-like framework directory framework_dir = None diff --git a/src/deepsparse/transformers/pipelines/pipeline.py b/src/deepsparse/transformers/pipelines/pipeline.py index 16603b6950..38073e260f 100644 --- a/src/deepsparse/transformers/pipelines/pipeline.py +++ b/src/deepsparse/transformers/pipelines/pipeline.py @@ -109,8 +109,7 @@ def setup_onnx_file_path(self) -> str: config_path, finetuning_task=self.task if hasattr(self, "task") else None ) self.tokenizer = AutoTokenizer.from_pretrained( - tokenizer_path, - model_max_length=self.sequence_length, + tokenizer_path, model_max_length=self.sequence_length ) self.config_path = os.path.join(config_path, "config.json") self.tokenizer_config_path = os.path.join(tokenizer_path, "tokenizer.json") @@ -127,22 +126,19 @@ def setup_onnx_file_path(self) -> str: return onnx_path def tokens_to_engine_input( - self, - tokens: Mapping[Any, numpy.ndarray], - onnx_input_names: Optional[List[str]] = None, + self, tokens: Mapping[Any, numpy.ndarray] ) -> List[numpy.ndarray]: """ :param tokens: outputs of the pipeline tokenizer :return: list of numpy arrays in expected order for model input """ - onnx_input_names = onnx_input_names or self.onnx_input_names - if not all(name in tokens for name in onnx_input_names): + if not all(name in tokens for name in self.onnx_input_names): raise ValueError( - f"pipeline expected arrays with names {onnx_input_names}, " + f"pipeline expected arrays with names {self.onnx_input_names}, " f"received inputs: {list(tokens.keys())}" ) - return [tokens[name] for name in onnx_input_names] + return [tokens[name] for name in self.onnx_input_names] @staticmethod def should_bucket(*args, **kwargs) -> bool: From fab44e4029f118f947cc41e7a901726528fc6f24 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 19 May 2023 13:04:12 +0000 Subject: [PATCH 24/68] Pipeline working for single-token inference only --- src/deepsparse/pipelines/text_generation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 80b6971482..35fc259629 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -251,6 +251,13 @@ def prompt_inference( new_token, kv_cache = self.autoregressive_inference( run_tokens, kv_cache, num_prompt_tokens=0 ) + # move the kv cache values corresponding to the prompt, to the front + for key, value in kv_cache.items(): + prompt_values = value[:, -len(tokens) :, :] + padded_values = value[:, : -len(tokens), :] + kv_cache[key] = numpy.concatenate( + [prompt_values, padded_values], axis=1 + ) else: # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) @@ 
-289,6 +296,10 @@ def autoregressive_inference( # the position of the token is the number of tokens - 1 (zero indexed) positions = numpy.array([[len(tokens)]], dtype=numpy.int64) + if num_prompt_tokens == 0: + # no prompt tokens, we are currently processing the prompt + positions -= 1 + engine_inputs = { "input_ids": numpy.array([[new_token]]), "attention_mask": attention_mask, From d454e2f1dd49409e32533a89edeae1fadaf492fb Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 19 May 2023 13:09:38 +0000 Subject: [PATCH 25/68] do not load the onnx model with external files twice --- src/deepsparse/pipelines/text_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 35fc259629..abfd44fd2a 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -547,6 +547,7 @@ def _setup_onnx_multitoken_file_path(self) -> str: ) = overwrite_transformer_onnx_model_inputs( self.onnx_file_path, max_length=self.sequence_length, + load_external_data=False, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( multitoken=True, From 1613e257037d71e210aedb25c4f9ec6ff416f983 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 19 May 2023 15:55:55 +0000 Subject: [PATCH 26/68] pipeline never redundantly saves the external data + more robust tokenizer --- src/deepsparse/pipelines/text_generation.py | 4 ++-- src/deepsparse/transformers/helpers.py | 14 ++------------ 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index abfd44fd2a..d6560b4fd2 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -355,9 +355,8 @@ def setup_onnx_file_path(self) -> str: self.config = AutoConfig.from_pretrained(config_path) - # So far hard-coding the tokenizer, to be figured out later self.tokenizer = AutoTokenizer.from_pretrained( - "facebook/opt-350m", + self.model_path, model_max_length=self.sequence_length, ) self.config_path = os.path.join(config_path, "config.json") @@ -369,6 +368,7 @@ def setup_onnx_file_path(self) -> str: ) = overwrite_transformer_onnx_model_inputs( onnx_path, max_length=self.sequence_length, + load_external_data=False, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( multitoken=False, diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 270f56d9c8..b241a6f3c5 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -183,23 +183,13 @@ def overwrite_transformer_onnx_model_inputs( external_input.type.tensor_type.shape.dim[1].dim_value = max_length input_names.append(external_input.name) - model_exceeds_protobuf_limit = model.ByteSize() > onnx.checker.MAXIMUM_PROTOBUF - # Save modified model if output_path is None: tmp_file = NamedTemporaryFile() # file will be deleted after program exit - save_onnx( - model, tmp_file.name - ) if not model_exceeds_protobuf_limit else save_onnx( - model, tmp_file.name, external_data_file=NamedTemporaryFile() - ) + save_onnx(model, tmp_file.name) return tmp_file.name, input_names, tmp_file else: - save_onnx( - model, output_path - ) if not model_exceeds_protobuf_limit else save_onnx( - model, output_path, external_data_file=NamedTemporaryFile() - ) + save_onnx(model, output_path) return 
input_names From b61055ca5f0c849b914cbee80ed0cb6d659f0cec Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 19 May 2023 16:36:24 +0000 Subject: [PATCH 27/68] Stop saving tmp files, otherwise the engine looks for external files in the wrong place --- src/deepsparse/transformers/helpers.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index b241a6f3c5..3e05cd5aaa 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -43,7 +43,7 @@ _LOGGER = get_main_logger() -_MODEL_DIR_ONNX_NAME = "model.onnx" +_MODEL_DIR_ONNX_NAME = "model_kvcache.onnx" _MODEL_DIR_CONFIG_NAME = "config.json" _MODEL_DIR_TOKENIZER_NAME = "tokenizer.json" _MODEL_DIR_TOKENIZER_CONFIG_NAME = "tokenizer_config.json" @@ -185,9 +185,8 @@ def overwrite_transformer_onnx_model_inputs( # Save modified model if output_path is None: - tmp_file = NamedTemporaryFile() # file will be deleted after program exit - save_onnx(model, tmp_file.name) - return tmp_file.name, input_names, tmp_file + save_onnx(model, path) + return path, input_names, path else: save_onnx(model, output_path) From 6ee25fcbfafe5fb971d7ec796a4fe1ebd071fbfd Mon Sep 17 00:00:00 2001 From: Benjamin Date: Fri, 19 May 2023 13:58:28 -0400 Subject: [PATCH 28/68] Left pad support --- src/deepsparse/pipelines/text_generation.py | 98 ++++++--------------- 1 file changed, 28 insertions(+), 70 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index d6560b4fd2..6d45d9e22f 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -113,6 +113,9 @@ def __init__( self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context ) + # override tokenizer to pad to left + self.tokenizer.padding_side = "left" + @staticmethod def route_input_to_bucket( *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs @@ -156,6 +159,9 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: self.tokenizer.pad_token = self.tokenizer.eos_token + # ensure tokenizer pads to left + self.tokenizer.padding_side = "left" + input_tokens = self.tokenizer( inputs.sequence, return_tensors="np", @@ -168,7 +174,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: attention_mask = input_tokens["attention_mask"] positions = attention_mask.cumsum(1) * attention_mask - positions -= 1 # zero index + positions -= 1 # zero index - TODO: investigate if needed outside OPT positions_input = dict(positions=positions) input_tokens = {**input_tokens, **kv_cache, **positions_input} @@ -235,10 +241,8 @@ def prompt_inference( - the list of prompt tokens plus the new, generated token - the kv cache that was populated during the inference """ - tokens = engine_inputs[0][0].tolist() - # remove trailing padding - while tokens[-1] == self.tokenizer.pad_token_id: - tokens.pop() + # get tokens by attention mask + tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() new_token = None @@ -251,18 +255,11 @@ def prompt_inference( new_token, kv_cache = self.autoregressive_inference( run_tokens, kv_cache, num_prompt_tokens=0 ) - # move the kv cache values corresponding to the prompt, to the front - for key, value in kv_cache.items(): - prompt_values = value[:, -len(tokens) :, :] - padded_values = value[:, : -len(tokens), :] - kv_cache[key] = numpy.concatenate( - [prompt_values, padded_values], 
axis=1 - ) else: # larger prompt size, run through multi-token engine in single pass logits, *cache_values = self.multitoken_engine(engine_inputs) - kv_cache = self.assemble_kv_cache(cache_values, tokens, len(tokens) - 1) - new_token = self.generate_token(logits[0, len(tokens) - 1]) + kv_cache = self.assemble_kv_cache(cache_values, tokens) + new_token = self.generate_token(logits[0, -1]) tokens.append(new_token) @@ -286,13 +283,12 @@ def autoregressive_inference( - the kv cache that was populated during the inference """ new_token = tokens[-1] - num_generated_tokens = len(tokens) - num_prompt_tokens - # due to right hand concatenation, attention mask is: - # 1s for length of prompt + 1s starting from RHS for generated tokens + 1 + # padding is added to left, so attention mask is 1s from the + # right up to the number of total tokens (prompt + generated) attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - attention_mask[:, :num_prompt_tokens] = 1 - attention_mask[:, -(min(num_generated_tokens, self.sequence_length)) :] = 1 + num_tokens_running = min(len(tokens), self.sequence_length) # cap by seq len + attention_mask[:, -num_tokens_running:] = 1 # the position of the token is the number of tokens - 1 (zero indexed) positions = numpy.array([[len(tokens)]], dtype=numpy.int64) @@ -317,7 +313,7 @@ def autoregressive_inference( engine_inputs = [engine_inputs[name] for name in self.engine.input_names] new_logits, *cache_values = self.engine(engine_inputs) - kv_cache = self.assemble_kv_cache(cache_values, tokens, num_prompt_tokens) + kv_cache = self.assemble_kv_cache(cache_values, tokens) # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) @@ -402,72 +398,34 @@ def assemble_kv_cache( self, cache_values: List[numpy.ndarray], tokens: List[int], - num_prompt_tokens: int, consider_sos_token: bool = False, ) -> Dict[str, numpy.ndarray]: """ Restructure the kv cache values from the engine output, so that it can be passed to the engine in the next inference run. - By default, every time this function is called, the cache key/value, - that immediately follows the cache keys/values corresponding to the - prompt tokens, is removed. - Example: - ``` - (`X` are the padded cache entries) - attention_mask = [1, 1, 1, 0, 0, 0, 0, 1, 1] - cache_entries = [1, 2, 3, X, X, X, 4, 5] - ([1,2,3] entries correspond to the prompt tokens) - ([4,5] entries correspond to the tokens generated in the previous inference run) - assemble_kv_cache() - cache_entries -> [1, 2, 3, X, X, 4, 5] - ``` - - Then, once the cache is almost full -> only one padded cache entry remains, - we need to remove it, so that no padded entries are present in the cache. 
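The "Left pad support" commit above moves all real tokens to the right edge of the window, which makes the attention mask a simple zeros-then-ones pattern. A small sketch with made-up sizes:

    import numpy

    sequence_length = 8
    num_tokens = 5  # prompt tokens plus tokens generated so far

    attention_mask = numpy.zeros((1, sequence_length), dtype=numpy.int64)
    attention_mask[:, -min(num_tokens, sequence_length):] = 1
    print(attention_mask)  # [[0 0 0 1 1 1 1 1]]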
- Example: - ``` - attention_mask = [1, 1, 1, 0, 1, 1, 1, 1] - cache_entries = [1, 2, 3, X, 4, 5, 6, 7] - assemble_kv_cache() - cache_entries -> [1, 2, 3, 4, 5, 6, 7] - ``` - - Finally, where all values in cache are non-padded, we pop from the front - of the cache (remove the oldest history from the cache): - - if the SOS (Start Of Sequence) token is not considered, - we pop from the 0th index - Example: - ``` - attention_mask = [1, 1, 1, 1, 1, 1, 1, 1] - cache_entries = [1, 2, 3, 4, 5, 6, 7, 8] - assemble_kv_cache() - cache_entries -> [2, 3, 4, 5, 6, 7, 8] - ``` - - - if the SOS token is considered, we pop from the 1st index - (because the 0th index is the SOS token, which we want to keep) - Example: - ``` - attention_mask = [1, 1, 1, 1, 1, 1, 1, 1] - cache_entries = [1, 2, 3, 4, 5, 6, 7, 8] - assemble_kv_cache() - cache_entries -> [1, 3, 4, 5, 6, 7, 8] - ``` + KV Cache concatenation adds an extra dimension to the output cache + which should be deleted + + There are two modes: + 1. Some values in the cache represent pad tokens, padding is to the left, + so the left most cache value is deleted + 2. The cache is saturated with 'real' tokens, if there is a mandatory + start-of-sequence (SOS) token, we delete after this one (idx after 0) + otherwise we delete from the left as in (1) + :param cache_values: the cache values from the engine output :param tokens: the tokens from the previous inference run - :param num_prompt_tokens: number of tokens in the initial prompt :param consider_sos_token: whether to consider the SOS token in the cache :return kv_cache: the restructured cache values """ for idx, cache_value in enumerate(cache_values): if len(tokens) > self.sequence_length - 1: idx_to_remove = int(not consider_sos_token) - elif len(tokens) == self.sequence_length - 1: - idx_to_remove = num_prompt_tokens else: - idx_to_remove = num_prompt_tokens + 1 + idx_to_remove = 0 + # TODO: see if we can do in-place cache_values[idx] = numpy.delete(cache_value, idx_to_remove, 1) cache_keys = [ From 5d3004b4c24473fccdd50bfec352263ad778587b Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 22 May 2023 08:44:42 +0000 Subject: [PATCH 29/68] cleanup --- src/deepsparse/pipelines/text_generation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 6d45d9e22f..7efab22d0f 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -159,9 +159,6 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: self.tokenizer.pad_token = self.tokenizer.eos_token - # ensure tokenizer pads to left - self.tokenizer.padding_side = "left" - input_tokens = self.tokenizer( inputs.sequence, return_tensors="np", @@ -289,7 +286,7 @@ def autoregressive_inference( attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) num_tokens_running = min(len(tokens), self.sequence_length) # cap by seq len attention_mask[:, -num_tokens_running:] = 1 - + breakpoint() # the position of the token is the number of tokens - 1 (zero indexed) positions = numpy.array([[len(tokens)]], dtype=numpy.int64) if num_prompt_tokens == 0: @@ -320,7 +317,7 @@ def autoregressive_inference( return generated_token, kv_cache - def generate_token(self, logits: numpy.ndarray) -> int: + def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: """ Samples a token from the logits using the sampling temperature. 
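Between these two commits, a note on the sampling that `generate_token` performs: greedy argmax when deterministic, otherwise a softmax draw scaled by the sampling temperature. The sketch below is an approximation of that idea; the max-subtraction stability trick and the exact placement of the temperature are assumptions, not a copy of the pipeline code:

    import numpy

    def sample_token(logits: numpy.ndarray, deterministic: bool = True, temperature: float = 1.0) -> int:
        if deterministic:
            return int(numpy.argmax(logits))
        scaled = logits / temperature
        probs = numpy.exp(scaled - scaled.max())  # subtract max for numerical stability
        probs /= probs.sum()
        return int(numpy.random.choice(len(probs), p=probs))

    print(sample_token(numpy.array([0.1, 2.0, -1.0])))  # 1 (argmax)
    print(sample_token(numpy.array([0.1, 2.0, -1.0]), deterministic=False, temperature=0.7))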
From ace6fa5e580e830c48206a199fe104c229ea1ad2 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 22 May 2023 08:47:03 +0000 Subject: [PATCH 30/68] cleanup2 --- src/deepsparse/pipelines/text_generation.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 7efab22d0f..aabcef746f 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -284,9 +284,8 @@ def autoregressive_inference( # padding is added to left, so attention mask is 1s from the # right up to the number of total tokens (prompt + generated) attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - num_tokens_running = min(len(tokens), self.sequence_length) # cap by seq len - attention_mask[:, -num_tokens_running:] = 1 - breakpoint() + num_tokens_processed = min(len(tokens), self.sequence_length) # cap by seq len + attention_mask[:, -num_tokens_processed:] = 1 # the position of the token is the number of tokens - 1 (zero indexed) positions = numpy.array([[len(tokens)]], dtype=numpy.int64) if num_prompt_tokens == 0: @@ -401,15 +400,17 @@ def assemble_kv_cache( Restructure the kv cache values from the engine output, so that it can be passed to the engine in the next inference run. - KV Cache concatenation adds an extra dimension to the output cache - which should be deleted + KV Cache concatenation adds an extra length dimension to the output + cache, that should be deleted after every inference run. There are two modes: - 1. Some values in the cache represent pad tokens, padding is to the left, - so the left most cache value is deleted - 2. The cache is saturated with 'real' tokens, if there is a mandatory - start-of-sequence (SOS) token, we delete after this one (idx after 0) - otherwise we delete from the left as in (1) + 1. Some values in the cache represent dummy (pad) tokens, padding is + to the left, so the left-most cache value is deleted + 2. The cache is saturated with non-dummy (meaningful) tokens: + - if there is a mandatory start-of-sequence (SOS) token, + we delete the left-most cache value that is not a cache + corresponding to SOS token. + - otherwise we delete from the left as in (1) :param cache_values: the cache values from the engine output :param tokens: the tokens from the previous inference run From 388586d100eb26283270f00b6a4de8457ab5b176 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Wed, 24 May 2023 08:04:26 -0400 Subject: [PATCH 31/68] Add in pipeline timing --- src/deepsparse/pipeline.py | 27 +++ src/deepsparse/pipelines/text_generation.py | 14 +- src/deepsparse/timing/pipeline_timer.py | 194 ++++++++++++++++++++ 3 files changed, 232 insertions(+), 3 deletions(-) create mode 100644 src/deepsparse/timing/pipeline_timer.py diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 105e828365..be31ab1b51 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -37,6 +37,7 @@ ) from deepsparse.tasks import SupportedTasks, dynamic_import_task from deepsparse.timing import InferencePhases, Timer +from deepsparse.timing.pipeline_timer import PipelineTimer __all__ = [ @@ -141,6 +142,7 @@ class PipelineImplementation(Pipeline): a path to the logging config, or yaml string representation the logging config. If logger provided (in any form), the pipeline will log inference metrics to the logger. 
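The timing commit above threads start/stop calls around the pre-process, engine-forward, and post-process phases, gated behind the new `benchmark` flag. A schematic of that bookkeeping, with stand-in callables instead of the real pipeline stages:

    import time

    phase_times = {}

    def record(phase, fn, *args):
        start = time.perf_counter()
        result = fn(*args)
        phase_times.setdefault(phase, []).append(time.perf_counter() - start)
        return result

    inputs = record("pre_process", lambda: list(range(4)))           # stand-in for tokenization
    outputs = record("engine_forward", lambda batch: batch, inputs)  # stand-in for the engine call
    final = record("post_process", lambda batch: sum(batch), outputs)

    print(final)        # 6
    print(phase_times)  # per-phase wall-clock durations in seconds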
Default is None + :param benchmark: An optional boolean flag that can be used to enable/disable """ def __init__( @@ -155,8 +157,12 @@ def __init__( context: Optional[Context] = None, executor: Optional[Union[ThreadPoolExecutor, int]] = None, logger: Optional[Union[BaseLogger, str]] = None, + benchmark: bool = False, _delay_engine_initialize: bool = False, # internal use only ): + self._benchmark = benchmark + self._timer = PipelineTimer(enabled=benchmark, multi_inference=True) + self._timer.reset() self._model_path_orig = model_path self._model_path = model_path self._engine_type = engine_type @@ -218,12 +224,15 @@ def __call__(self, *args, **kwargs) -> BaseModel: "invalid kwarg engine_inputs. engine inputs determined " f"by {self.__class__.__qualname__}.parse_inputs" ) + self._timer.reset() timer = Timer() timer.start(InferencePhases.TOTAL_INFERENCE) + self._timer.start_inference_stage(InferencePhases.TOTAL_INFERENCE) # ------ PREPROCESSING ------ timer.start(InferencePhases.PRE_PROCESS) + self._timer.start_inference_stage(InferencePhases.PRE_PROCESS) # parse inputs into input_schema pipeline_inputs = self.parse_inputs(*args, **kwargs) @@ -245,6 +254,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: else: postprocess_kwargs = {} timer.stop(InferencePhases.PRE_PROCESS) + self._timer.stop_inference_stage(InferencePhases.PRE_PROCESS) self.log( identifier="engine_inputs", @@ -260,6 +270,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: # ------ INFERENCE ------ # split inputs into batches of size `self._batch_size` timer.start(InferencePhases.ENGINE_FORWARD) + self._timer.start_inference_stage(InferencePhases.ENGINE_FORWARD) # Hack to enable inference with `cache_length` argument # batches = self.split_engine_inputs(engine_inputs, self._batch_size) batches = [engine_inputs] @@ -270,6 +281,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) timer.stop(InferencePhases.ENGINE_FORWARD) + self._timer.stop_inference_stage(InferencePhases.ENGINE_FORWARD) self.log( identifier=f"{SystemGroups.INFERENCE_DETAILS}/input_batch_size_total", @@ -294,6 +306,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: # ------ POSTPROCESSING ------ timer.start(InferencePhases.POST_PROCESS) + self._timer.start_inference_stage(InferencePhases.POST_PROCESS) pipeline_outputs = self.process_engine_outputs( engine_outputs, **postprocess_kwargs ) @@ -303,7 +316,9 @@ def __call__(self, *args, **kwargs) -> BaseModel: f"{self.output_schema} found output of type {type(pipeline_outputs)}" ) timer.stop(InferencePhases.POST_PROCESS) + self._timer.stop_inference_stage(InferencePhases.POST_PROCESS) timer.stop(InferencePhases.TOTAL_INFERENCE) + self._timer.stop_inference_stage(InferencePhases.TOTAL_INFERENCE) self.log( identifier="pipeline_outputs", @@ -737,6 +752,18 @@ def engine_type(self) -> str: """ return self._engine_type + @property + def timer(self) -> PipelineTimer: + return self._timer + + @property + def benchmark(self) -> bool: + return self._benchmark + + @benchmark.setter + def benchmark(self, value: bool): + self._benchmark = value + def to_config(self) -> "PipelineConfig": """ :return: PipelineConfig that can be used to reload this object diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index aabcef746f..998836e097 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -238,6 +238,7 @@ 
def prompt_inference( - the list of prompt tokens plus the new, generated token - the kv cache that was populated during the inference """ + self.timer.start_inference_stage("prompt_inference") # get tokens by attention mask tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() @@ -254,11 +255,14 @@ def prompt_inference( ) else: # larger prompt size, run through multi-token engine in single pass + self.timer.start_inference_stage("multitoken_engine") logits, *cache_values = self.multitoken_engine(engine_inputs) + self.timer.stop_inference_stage("multitoken_engine") kv_cache = self.assemble_kv_cache(cache_values, tokens) new_token = self.generate_token(logits[0, -1]) tokens.append(new_token) + self.timer.stop_inference_stage("prompt_inference") return tokens, kv_cache @@ -279,6 +283,7 @@ def autoregressive_inference( - the new, generated token - the kv cache that was populated during the inference """ + self.timer.start_inference_stage("autoregressive_inference") new_token = tokens[-1] # padding is added to left, so attention mask is 1s from the @@ -308,11 +313,14 @@ def autoregressive_inference( engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] + self.timer.start_inference_stage("autoregressive_inference_engine") new_logits, *cache_values = self.engine(engine_inputs) + self.timer.stop_inference_stage("autoregressive_inference_engine") kv_cache = self.assemble_kv_cache(cache_values, tokens) # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) + self.timer.stop_inference_stage("autoregressive_inference") return generated_token, kv_cache @@ -400,14 +408,14 @@ def assemble_kv_cache( Restructure the kv cache values from the engine output, so that it can be passed to the engine in the next inference run. - KV Cache concatenation adds an extra length dimension to the output + KV Cache concatenation adds an extra length dimension to the output cache, that should be deleted after every inference run. There are two modes: - 1. Some values in the cache represent dummy (pad) tokens, padding is + 1. Some values in the cache represent dummy (pad) tokens, padding is to the left, so the left-most cache value is deleted 2. The cache is saturated with non-dummy (meaningful) tokens: - - if there is a mandatory start-of-sequence (SOS) token, + - if there is a mandatory start-of-sequence (SOS) token, we delete the left-most cache value that is not a cache corresponding to SOS token. - otherwise we delete from the left as in (1) diff --git a/src/deepsparse/timing/pipeline_timer.py b/src/deepsparse/timing/pipeline_timer.py new file mode 100644 index 0000000000..7874bea513 --- /dev/null +++ b/src/deepsparse/timing/pipeline_timer.py @@ -0,0 +1,194 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
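A hypothetical usage example of the `InferenceTimer` this new module defines (the import path assumes the file lands at src/deepsparse/timing/pipeline_timer.py as written in this patch):

    from time import sleep

    from deepsparse.timing.pipeline_timer import InferenceTimer

    timer = InferenceTimer()
    for _ in range(3):
        timer.start("engine_forward")
        sleep(0.01)  # stand-in for an engine call
        timer.stop("engine_forward")

    print(timer.stage_average_time("engine_forward"))  # mean over the three runs
    print(timer.times)      # {"engine_forward": <average seconds>}
    print(timer.all_times)  # {"engine_forward": [t0, t1, t2]}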
+ +import time +from typing import Dict, List + + +__all__ = ["InferenceTimer", "PipelineTimer"] + + +class InferenceTimer: + def __init__(self): + self._staged_start_times = {} + self._staged_stop_times = {} + + def __repr__(self): + return f"InferenceTimer({self.times})" + + @property + def stages(self) -> List[str]: + return list(self._staged_start_times.keys()) + + @property + def times(self) -> Dict[str, float]: + return {stage: self.stage_average_time(stage) for stage in self.stages} + + @property + def all_times(self) -> Dict[str, List[float]]: + return {stage: self.stage_times(stage) for stage in self.stages} + + def clear(self): + self._staged_start_times.clear() + self._staged_stop_times.clear() + + def has_stage(self, stage: str) -> bool: + return stage in self.stages + + def start(self, stage: str): + if stage not in self._staged_start_times: + self._staged_start_times[stage] = [] + self._staged_stop_times[stage] = [] + + if len(self._staged_start_times[stage]) != len(self._staged_stop_times[stage]): + raise ValueError( + f"Attempting to start {stage} before a previous has been stopped:" + f" start times len({self._staged_start_times[stage]});" + f" stop times len({self._staged_stop_times[stage]})" + ) + + self._staged_start_times[stage].append(time.perf_counter()) + + def stop(self, stage: str): + if stage not in self._staged_start_times: + raise ValueError( + "Attempting to stop a stage that has not been started: " + f"{stage}" + ) + + if ( + len(self._staged_start_times[stage]) + != len(self._staged_stop_times[stage]) + 1 + ): + raise ValueError( + f"Attempting to stop {stage} before a previous has been started:" + f" start times len({self._staged_start_times[stage]});" + f" stop times len({self._staged_stop_times[stage]})" + ) + + self._staged_stop_times[stage].append(time.perf_counter()) + + def stage_times(self, stage: str) -> List[float]: + if stage not in self._staged_start_times: + raise ValueError( + "Attempting to get time deltas for a stage that has not been started: " + f"{stage}" + ) + + if len(self._staged_start_times[stage]) != len(self._staged_stop_times[stage]): + raise ValueError( + "Attempting to get time deltas for a stage that has not been stopped: " + f"{stage}" + ) + + return [ + self._staged_stop_times[stage][i] - self._staged_start_times[stage][i] + for i in range(len(self._staged_start_times[stage])) + ] + + def stage_average_time(self, stage: str) -> float: + times = self.stage_times(stage) + + return sum(times) / len(times) + + +class PipelineTimer: + def __init__(self, enabled: bool = True, multi_inference: bool = False): + self._multi_inference = multi_inference + self._enabled = enabled + self._timers = [] + + def __repr__(self): + return f"PipelineTimer({self.times})" + + @property + def enabled(self): + return self._enabled + + @enabled.setter + def enabled(self, value): + self._enabled = value + + @property + def multi_inference(self): + return self._multi_inference + + @multi_inference.setter + def multi_inference(self, value): + self._multi_inference = value + + @property + def current_inference(self) -> InferenceTimer: + return self._timers[-1] if self._timers else None + + @property + def inferences(self) -> List[InferenceTimer]: + return self._timers + + @property + def stages(self) -> List[str]: + stages = set() + + for timer in self._timers: + stages.update(timer.stages) + + return list(stages) + + @property + def times(self) -> Dict[str, float]: + all_times = self.all_times + + return { + stage: sum(all_times[stage]) / 
len(all_times[stage]) + for stage in self.stages + } + + @property + def all_times(self) -> Dict[str, List[float]]: + all_times = {stage: [] for stage in self.stages} + + for timer in self._timers: + for stage, times in timer.all_times.items(): + all_times[stage].extend(times) + + return all_times + + def reset(self): + if not self._enabled: + return + + self._check_start_inference() + + if self.multi_inference: + self._timers.append(InferenceTimer()) + else: + self._timers[0].clear() + + def start_inference_stage(self, stage: str): + if not self._enabled: + return + + self._check_start_inference() + self._timers[-1].start(stage) + + def stop_inference_stage(self, stage: str): + if not self._enabled: + return + + self._check_start_inference() + self._timers[-1].stop(stage) + + def _check_start_inference(self): + if not self.current_inference: + self._timers.append(InferenceTimer()) From afd0139a3941fd0799cc2885a3c5064629577cff Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Wed, 24 May 2023 08:39:36 -0400 Subject: [PATCH 32/68] add in force tokens logic --- src/deepsparse/pipelines/text_generation.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 998836e097..e70207edb2 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -75,6 +75,10 @@ class TextGenerationPipeline(TransformersPipeline): tokens until the end of the sequence is reached. Otherwise, it will generate up to the maximum number of tokens or end of sequence is reached. + :param prompt_batch_threshold: the threshold for the ratio of running the prompt + as a single, batched inference vs running the prompt auto-regressively. + :param force_max_tokens: if True, the pipeline will generate the maximum number + of tokens supplied even if the stop token is reached. 
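A schematic of the stopping rule that `force_max_tokens` controls: generation runs for at most `max_generated_tokens` steps and normally breaks on the end-of-sequence token, unless the flag forces the full budget. The token ids and the scripted `next_token` below are stand-ins, not real model output:

    EOS_TOKEN_ID = 2
    max_generated_tokens = 5
    force_max_tokens = False

    scripted_outputs = iter([11, 12, EOS_TOKEN_ID, 13, 14])

    def next_token() -> int:
        return next(scripted_outputs)

    generated = []
    for _ in range(max_generated_tokens):
        token = next_token()
        generated.append(token)
        if token == EOS_TOKEN_ID and not force_max_tokens:
            break

    print(generated)  # [11, 12, 2]; with force_max_tokens=True all five would be kept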
:param kwargs: kwargs to pass to the TransformersPipeline """ @@ -84,6 +88,7 @@ def __init__( sampling_temperature: float = 1.0, max_generated_tokens: Optional[int] = 1024, prompt_batch_threshold: float = 0.25, + force_max_tokens: bool = False, **kwargs, ): super().__init__(**kwargs, _delay_engine_initialize=True) @@ -95,6 +100,7 @@ def __init__( self.sampling_temperature = sampling_temperature self.max_generated_tokens = max_generated_tokens self.prompt_batch_threshold = prompt_batch_threshold + self.force_max_tokens = force_max_tokens self.engine = Pipeline.create_engine( self.onnx_file_path, @@ -220,7 +226,7 @@ def engine_forward( tokens.append(gen_token) generated.append(gen_token) - if gen_token == self.tokenizer.eos_token_id: + if gen_token == self.tokenizer.eos_token_id and not self.force_max_tokens: break return numpy.array([[generated]]) From 30eeda749b33575e6a760f3beb5c562323a7e989 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Wed, 24 May 2023 09:07:02 -0400 Subject: [PATCH 33/68] remove input validation for text generation pipelines --- src/deepsparse/pipelines/text_generation.py | 4 ++-- src/deepsparse/timing/pipeline_timer.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index e70207edb2..41540e29ea 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -262,7 +262,7 @@ def prompt_inference( else: # larger prompt size, run through multi-token engine in single pass self.timer.start_inference_stage("multitoken_engine") - logits, *cache_values = self.multitoken_engine(engine_inputs) + logits, *cache_values = self.multitoken_engine(engine_inputs, val_inp=False) self.timer.stop_inference_stage("multitoken_engine") kv_cache = self.assemble_kv_cache(cache_values, tokens) new_token = self.generate_token(logits[0, -1]) @@ -320,7 +320,7 @@ def autoregressive_inference( engine_inputs = [engine_inputs[name] for name in self.engine.input_names] self.timer.start_inference_stage("autoregressive_inference_engine") - new_logits, *cache_values = self.engine(engine_inputs) + new_logits, *cache_values = self.engine(engine_inputs, val_inp=False) self.timer.stop_inference_stage("autoregressive_inference_engine") kv_cache = self.assemble_kv_cache(cache_values, tokens) diff --git a/src/deepsparse/timing/pipeline_timer.py b/src/deepsparse/timing/pipeline_timer.py index 7874bea513..13c4f5850b 100644 --- a/src/deepsparse/timing/pipeline_timer.py +++ b/src/deepsparse/timing/pipeline_timer.py @@ -63,8 +63,7 @@ def start(self, stage: str): def stop(self, stage: str): if stage not in self._staged_start_times: raise ValueError( - "Attempting to stop a stage that has not been started: " - f"{stage}" + "Attempting to stop a stage that has not been started: " f"{stage}" ) if ( From 5882b562487ad7019087ac1cf2669f34dac33281 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Wed, 24 May 2023 09:30:45 -0400 Subject: [PATCH 34/68] remove multitoken support for now --- src/deepsparse/pipelines/text_generation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 41540e29ea..9486e07904 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/pipelines/text_generation.py @@ -115,9 +115,10 @@ def __init__( self.onnx_multitoken_path, self._temp_model_directory, ) = self._setup_onnx_multitoken_file_path() - 
self.multitoken_engine = Pipeline.create_engine( - self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context - ) + self.multitoken_engine = None + # Pipeline.create_engine( + # self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context + # ) # override tokenizer to pad to left self.tokenizer.padding_side = "left" @@ -250,7 +251,7 @@ def prompt_inference( new_token = None - if len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: + if True: #len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] kv_cache = {} From 4bbe33dad3a3a2282825c3635107129ee49f2781 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Thu, 25 May 2023 06:59:49 -0400 Subject: [PATCH 35/68] remove kv cache engine and other fixes --- src/deepsparse/engine.py | 33 +++++-------- src/deepsparse/pipeline.py | 5 +- src/deepsparse/pipelines/text_generation.py | 38 +++++++++------ src/deepsparse/transformers/helpers.py | 51 ++++++++++++--------- src/deepsparse/utils/onnx.py | 22 ++++----- 5 files changed, 75 insertions(+), 74 deletions(-) diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index 6d6c4116ea..d71a8b6b5e 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -276,32 +276,23 @@ def __init__( num_streams: int = None, scheduler: Scheduler = None, input_shapes: List[List[int]] = None, + cache_inputs: List[bool] = None, ): BaseEngine.construct( self, model, batch_size, num_cores, num_streams, scheduler, input_shapes ) - if self._input_shapes: - with override_onnx_input_shapes( - self._model_path, self._input_shapes - ) as model_path: - self._eng_net = LIB.deepsparse_engine( - model_path, - self._batch_size, - self._num_cores, - self._num_streams, - self._scheduler.value, - None, - ) - else: - self._eng_net = LIB.deepsparse_engine( - self._model_path, - self._batch_size, - self._num_cores, - self._num_streams, - self._scheduler.value, - None, - ) + num_streams = _validate_num_streams(num_streams, self._num_cores) + override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) + self._eng_net = LIB.deepsparse_engine( + self._model_path, + self._batch_size, + self._num_cores, + num_streams, + self._scheduler.value, + None, + cache_inputs, + ) def __call__( self, diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index be31ab1b51..75abb42939 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -25,7 +25,7 @@ import numpy from pydantic import BaseModel, Field -from deepsparse import Context, Engine, KVCacheEngine, MultiModelEngine, Scheduler +from deepsparse import Context, Engine, MultiModelEngine, Scheduler from deepsparse.benchmark import ORTEngine from deepsparse.cpu import cpu_details from deepsparse.loggers.base_logger import BaseLogger @@ -590,7 +590,6 @@ def create_engine( engine_type: str, engine_args: Dict, context: Optional[Context] = None, - support_kv_cache: bool = False, ) -> Union[Engine, MultiModelEngine, ORTEngine]: engine_type = engine_type.lower() @@ -603,8 +602,6 @@ def create_engine( model=onnx_file_path, **engine_args, ) - if support_kv_cache: - return KVCacheEngine(onnx_file_path, **engine_args) return Engine(onnx_file_path, **engine_args) if engine_type == ORT_ENGINE: diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py index 9486e07904..a4a510b74f 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ 
b/src/deepsparse/pipelines/text_generation.py @@ -26,6 +26,7 @@ overwrite_transformer_onnx_model_inputs, ) from deepsparse.transformers.pipelines import TransformersPipeline +from deepsparse.utils import get_output_names OPT_CACHE_HIDDEN_DIM = 64 @@ -76,7 +77,10 @@ class TextGenerationPipeline(TransformersPipeline): Otherwise, it will generate up to the maximum number of tokens or end of sequence is reached. :param prompt_batch_threshold: the threshold for the ratio of running the prompt - as a single, batched inference vs running the prompt auto-regressively. + as a single inference vs running the prompt auto-regressively. + If the number of input sequences divided by the max sequence length is + greater than the threshold, the prompt will be run as a single inference. + Default is None, which will always run auto-regressively. :param force_max_tokens: if True, the pipeline will generate the maximum number of tokens supplied even if the stop token is reached. :param kwargs: kwargs to pass to the TransformersPipeline @@ -87,7 +91,7 @@ def __init__( deterministic: bool = True, sampling_temperature: float = 1.0, max_generated_tokens: Optional[int] = 1024, - prompt_batch_threshold: float = 0.25, + prompt_batch_threshold: float = None, force_max_tokens: bool = False, **kwargs, ): @@ -107,18 +111,22 @@ def __init__( self.engine_type, self.engine_args, self.context, - support_kv_cache=True, ) - # initialize the auxiliary multitoken engine - ( - self.onnx_multitoken_path, - self._temp_model_directory, - ) = self._setup_onnx_multitoken_file_path() - self.multitoken_engine = None - # Pipeline.create_engine( - # self.onnx_multitoken_path, self.engine_type, self.engine_args, self.context - # ) + if prompt_batch_threshold is not None and prompt_batch_threshold < 1: + ( + self.onnx_multitoken_path, + self._temp_model_directory, + ) = self._setup_onnx_multitoken_file_path() + self.multitoken_engine = Pipeline.create_engine( + self.onnx_multitoken_path, + self.engine_type, + self.engine_args, + self.context, + ) + else: + self.onnx_multitoken_path = None + self.multitoken_engine = None # override tokenizer to pad to left self.tokenizer.padding_side = "left" @@ -251,7 +259,11 @@ def prompt_inference( new_token = None - if True: #len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold: + if ( + self.prompt_batch_threshold is None + or self.prompt_batch_threshold >= 1 + or len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold + ): # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] kv_cache = {} diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 3e05cd5aaa..adc091150d 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -69,10 +69,21 @@ def get_onnx_path_and_configs( if os.path.isfile(model_path) and not require_configs: return model_path, None, None - config_path = None - tokenizer_path = None + if os.path.isfile(model_path): + if require_configs: + raise RuntimeError( + f"Unable to find config files for model_path {model_path}. " + f"model_path must be a directory containing model.onnx, config.json," + f" and tokenizer.json files." 
+ ) + + return model_path, None, None if os.path.isdir(model_path): + # default to model_path + config_path = model_path + tokenizer_path = model_path + model_files = os.listdir(model_path) if _MODEL_DIR_ONNX_NAME not in model_files: @@ -84,12 +95,13 @@ def get_onnx_path_and_configs( onnx_path = os.path.join(model_path, _MODEL_DIR_ONNX_NAME) # attempt to read config and tokenizer from sparsezoo-like framework directory - framework_dir = None - if "framework" in model_files: - framework_dir = os.path.join(model_path, "framework") - if "pytorch" in model_files: - framework_dir = os.path.join(model_path, "pytorch") - if framework_dir and os.path.isdir(framework_dir): + for framework_dir in [ + os.path.join(model_path, "framework"), + os.path.join(model_path, "pytorch"), + ]: + if not os.path.exists(framework_dir) or not os.path.isdir(framework_dir): + continue + framework_files = os.listdir(framework_dir) if _MODEL_DIR_CONFIG_NAME in framework_files: config_path = framework_dir @@ -102,7 +114,9 @@ def get_onnx_path_and_configs( if _MODEL_DIR_TOKENIZER_NAME in model_files: tokenizer_path = model_path - elif model_path.startswith("zoo:"): + return onnx_path, config_path, tokenizer_path + + if model_path.startswith("zoo:"): zoo_model = Model(model_path) onnx_path = zoo_model.onnx_model.path config_path = _get_file_parent( @@ -115,20 +129,13 @@ def get_onnx_path_and_configs( _MODEL_DIR_TOKENIZER_CONFIG_NAME ) if tokenizer_config_path is not None: - tokenizer_config_path.path # trigger download of tokenizer_config - elif require_configs and (config_path is None or tokenizer_path is None): - raise RuntimeError( - f"Unable to find model and tokenizer config for model_path {model_path}. " - f"model_path must be a directory containing model.onnx, config.json, and " - f"tokenizer.json files. 
Found config and tokenizer paths: {config_path}, " - f"{tokenizer_path}" - ) - else: - raise ValueError( - f"model_path {model_path} is not a valid file, directory, or zoo stub" - ) + _ = tokenizer_config_path.path # trigger download of tokenizer_config + + return onnx_path, config_path, tokenizer_path - return onnx_path, config_path, tokenizer_path + raise ValueError( + f"model_path {model_path} is not a valid file, directory, or zoo stub" + ) def overwrite_transformer_onnx_model_inputs( diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 465b14987e..f68ff259d0 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -129,7 +129,7 @@ def get_external_inputs(onnx_filepath: str) -> List: :param onnx_filepath: File path to ONNX model :return: List of input objects """ - model = onnx.load(onnx_filepath) + model = onnx.load(onnx_filepath, load_external_data=False) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -144,7 +144,7 @@ def get_external_outputs(onnx_filepath: str) -> List: :param onnx_filepath: File path to ONNX model :return: List of output objects """ - model = onnx.load(onnx_filepath) + model = onnx.load(onnx_filepath, load_external_data=False) return [output for output in model.graph.output] @@ -211,19 +211,14 @@ def override_onnx_batch_size( input for input in all_inputs if input.name not in initializer_input_names ] for external_input in external_inputs: - if external_input.name == "cache_length": - continue - if external_input.name.startswith("past_key_values"): - continue # TODO: really should set to BS * num_heads, skipping for now for testing external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # Save modified model, this will be cleaned up when context is exited if inplace: onnx.save(model, onnx_filepath) return onnx_filepath - else: - # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + + # Save modified model, this will be cleaned up when context is exited + return save_onnx_to_temp_files(model, with_external_data=False) def override_onnx_input_shapes( @@ -277,13 +272,12 @@ def override_onnx_input_shapes( for dim_idx, dim in enumerate(external_input.type.tensor_type.shape.dim): dim.dim_value = input_shapes[input_idx][dim_idx] - # Save modified model, this will be cleaned up when context is exited if inplace: onnx.save(model, onnx_filepath) return onnx_filepath - else: - # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + + # Save modified model, this will be cleaned up when context is exited + return save_onnx_to_temp_files(model, with_external_data=False) def truncate_onnx_model( From afa574644f16d240096f4bbd14937d6d99e0996c Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Thu, 25 May 2023 07:24:45 -0400 Subject: [PATCH 36/68] nest input shape override --- src/deepsparse/engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index d71a8b6b5e..bcb1b8675a 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -283,7 +283,8 @@ def __init__( ) num_streams = _validate_num_streams(num_streams, self._num_cores) - override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) + if self._input_shapes is not None: + override_onnx_input_shapes(self._model_path, 
self._input_shapes, inplace=True) self._eng_net = LIB.deepsparse_engine( self._model_path, self._batch_size, From e2bb78c5c7d632bc8759f18e2a4b8e457acca37d Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Thu, 25 May 2023 07:26:25 -0400 Subject: [PATCH 37/68] comment out input shape override --- src/deepsparse/engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index bcb1b8675a..21a7604d3c 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -283,8 +283,8 @@ def __init__( ) num_streams = _validate_num_streams(num_streams, self._num_cores) - if self._input_shapes is not None: - override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) + # if self._input_shapes is not None: + # override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) self._eng_net = LIB.deepsparse_engine( self._model_path, self._batch_size, From 22990098b2b96a41ee289ea5fcd91fa26df6ad14 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Thu, 25 May 2023 08:02:19 -0400 Subject: [PATCH 38/68] add non batch override for ORT --- src/deepsparse/benchmark/ort_engine.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/benchmark/ort_engine.py b/src/deepsparse/benchmark/ort_engine.py index 9a79705f78..fba10c4062 100644 --- a/src/deepsparse/benchmark/ort_engine.py +++ b/src/deepsparse/benchmark/ort_engine.py @@ -63,7 +63,7 @@ def _validate_ort_import(): def _validate_batch_size(batch_size: int) -> int: - if batch_size < 1: + if batch_size is not None and batch_size < 1: raise ValueError("batch_size must be greater than 0") return batch_size @@ -130,7 +130,7 @@ def __init__( sess_options, providers=providers, ) - else: + elif batch_size is not None: with override_onnx_batch_size( self._model_path, batch_size ) as batch_override_model_path: @@ -139,6 +139,12 @@ def __init__( sess_options, providers=providers, ) + else: + self._eng_net = onnxruntime.InferenceSession( + self._model_path, + sess_options, + providers=providers, + ) def __call__( self, From 2935b77e6f933f33ef9686d89311e2ad0dc86b53 Mon Sep 17 00:00:00 2001 From: Mark Kurtz Date: Fri, 9 Jun 2023 08:23:57 -0400 Subject: [PATCH 39/68] clean up generation pipeline --- src/deepsparse/benchmark/ort_engine.py | 41 ++----- src/deepsparse/engine.py | 106 +----------------- src/deepsparse/tasks.py | 2 +- src/deepsparse/transformers/helpers.py | 2 +- .../pipelines/text_generation.py | 1 + 5 files changed, 15 insertions(+), 137 deletions(-) rename src/deepsparse/{ => transformers}/pipelines/text_generation.py (99%) diff --git a/src/deepsparse/benchmark/ort_engine.py b/src/deepsparse/benchmark/ort_engine.py index fba10c4062..e9f16c1e0f 100644 --- a/src/deepsparse/benchmark/ort_engine.py +++ b/src/deepsparse/benchmark/ort_engine.py @@ -62,7 +62,7 @@ def _validate_ort_import(): ) -def _validate_batch_size(batch_size: int) -> int: +def _validate_batch_size(batch_size: int) -> Optional[int]: if batch_size is not None and batch_size < 1: raise ValueError("batch_size must be greater than 0") @@ -116,35 +116,15 @@ def __init__( f" num_cores={num_cores}, please specify CPUExecutionProvider" ) - # TODO (michael): Unfortunately we are stacking overrides here, this can be - # cleaned up once we pass the loaded ONNX around and not paths if self._input_shapes: - with override_onnx_input_shapes( - self._model_path, self._input_shapes - ) as input_override_model_path: - with override_onnx_batch_size( - 
input_override_model_path, batch_size - ) as batch_override_model_path: - self._eng_net = onnxruntime.InferenceSession( - batch_override_model_path, - sess_options, - providers=providers, - ) - elif batch_size is not None: - with override_onnx_batch_size( - self._model_path, batch_size - ) as batch_override_model_path: - self._eng_net = onnxruntime.InferenceSession( - batch_override_model_path, - sess_options, - providers=providers, - ) - else: - self._eng_net = onnxruntime.InferenceSession( - self._model_path, - sess_options, - providers=providers, - ) + override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) + if self._batch_size is not None: + override_onnx_batch_size(self._model_path, self._batch_size, inplace=True) + self._eng_net = onnxruntime.InferenceSession( + self._model_path, + sess_options, + providers=providers, + ) def __call__( self, @@ -287,8 +267,7 @@ def run( :return: The list of outputs from the model after executing over the inputs """ if val_inp: - pass - # self._validate_inputs(inp) + self._validate_inputs(inp) inputs_dict = {name: value for name, value in zip(self.input_names, inp)} return self._eng_net.run(self.output_names, inputs_dict) diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index 21a7604d3c..b7641287b2 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -281,10 +281,9 @@ def __init__( BaseEngine.construct( self, model, batch_size, num_cores, num_streams, scheduler, input_shapes ) - num_streams = _validate_num_streams(num_streams, self._num_cores) - # if self._input_shapes is not None: - # override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) + if self._input_shapes is not None: + override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) self._eng_net = LIB.deepsparse_engine( self._model_path, self._batch_size, @@ -837,107 +836,6 @@ def __init__( ) -class KVCacheEngine(Engine): - """ - Engine that can do kv caching. - """ - - def __init__( - self, - model: Union[str, "Model", "File"], - batch_size: int = 1, - num_cores: int = None, - num_streams: int = None, - scheduler: Scheduler = None, - input_shapes: List[List[int]] = None, - ): - _analytics.send_event("python__engine__init") - self._model_path = model_to_path(model) - self._batch_size = _validate_batch_size(batch_size) - self._num_cores = _validate_num_cores(num_cores) - self._scheduler = _validate_scheduler(scheduler) - self._input_shapes = input_shapes - self._cpu_avx_type = AVX_TYPE - self._cpu_vnni = VNNI - - num_streams = _validate_num_streams(num_streams, self._num_cores) - if self._input_shapes: - raise NotImplementedError("") - # with override_onnx_input_shapes( - # self._model_path, self._input_shapes - # ) as model_path: - # self._eng_net = LIB.deepsparse_engine( - # model_path, - # self._batch_size, - # self._num_cores, - # num_streams, - # self._scheduler.value, - # None, - # ) - else: - # create a boolean list of every output of the - # model (logits, key0, value0, key1, value1, ..., key19, value, 19) - kv_cache_bools = [True for i in range(41)] - kv_cache_bools[0] = False # logits ought not to be cached - - self._eng_net = LIB.deepsparse_engine( - self._model_path, - self._batch_size, - self._num_cores, - num_streams, - self._scheduler.value, - None, - kv_cache_bools, # pass in the boolean list - 0, # since we start with no initial cache, pass in 0 for the initial cached position - ) - - -class KVCacheEngine(Engine): - """ - Engine that can do kv caching. 
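For reference, a short sketch (not part of the patch) of how the per-output cache flags that the removed KVCacheEngine computed could be rebuilt by a caller, using the get_output_names helper imported elsewhere in this PR; whether the new cache_inputs argument on Engine expects exactly this list is an assumption here, not something the diff states:

from deepsparse.utils import get_output_names

def kv_cache_flags(onnx_filepath):
    # one boolean per model output; every output except the first
    # (the logits) is treated as a KV-cache tensor
    flags = [True] * len(get_output_names(onnx_filepath))
    flags[0] = False
    return flags
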
- """ - - def __init__( - self, - model: Union[str, "Model", "File"], - batch_size: int = 1, - num_cores: int = None, - num_streams: int = None, - scheduler: Scheduler = None, - input_shapes: List[List[int]] = None, - kv_cache_bools: List[bool] = None, - prev_cache_length: int = 0, - ): - BaseEngine.construct( - self, model, batch_size, num_cores, num_streams, scheduler, input_shapes - ) - - if kv_cache_bools is None: - # If no list was provided, then we assume all outputs except for the first are KV caches - # Note: In the future we can look at the names of outputs to be more sure - # - # Create a boolean list of every output of the model - output_names = get_output_names(self._model_path) - kv_cache_bools = [True for i in range(len(output_names))] - # Assume first input is logits and logits ought not to be cached - kv_cache_bools[0] = False - - num_streams = _validate_num_streams(num_streams, self._num_cores) - if self._input_shapes: - raise NotImplementedError("Don't do this yet :)") - else: - self._eng_net = LIB.deepsparse_engine( - self._model_path, - self._batch_size, - self._num_cores, - num_streams, - self._scheduler.value, - None, - kv_cache_bools, - prev_cache_length, - ) - - def compile_model( model: Union[str, "Model", "File"], batch_size: int = 1, diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py index 7ea812f4cc..2aee2ddd9d 100644 --- a/src/deepsparse/tasks.py +++ b/src/deepsparse/tasks.py @@ -156,7 +156,7 @@ def check_register_task( import deepsparse.pipelines.custom_pipeline # noqa: F401 elif cls.is_text_generation(task): - import deepsparse.pipelines.text_generation + import deepsparse.transformers.pipelines.text_generation # noqa: F401 elif cls.is_nlp(task): # trigger transformers pipelines to register with Pipeline.register diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index adc091150d..998e301e2d 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -43,7 +43,7 @@ _LOGGER = get_main_logger() -_MODEL_DIR_ONNX_NAME = "model_kvcache.onnx" +_MODEL_DIR_ONNX_NAME = "model.onnx" _MODEL_DIR_CONFIG_NAME = "config.json" _MODEL_DIR_TOKENIZER_NAME = "tokenizer.json" _MODEL_DIR_TOKENIZER_CONFIG_NAME = "tokenizer_config.json" diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py similarity index 99% rename from src/deepsparse/pipelines/text_generation.py rename to src/deepsparse/transformers/pipelines/text_generation.py index a4a510b74f..9effbc3315 100644 --- a/src/deepsparse/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -114,6 +114,7 @@ def __init__( ) if prompt_batch_threshold is not None and prompt_batch_threshold < 1: + raise ValueError("multitoken engine is currently not supported") ( self.onnx_multitoken_path, self._temp_model_directory, From dc3d61bee8faf0aade60b07b4576db0cf7cfcfdc Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 5 Jun 2023 15:55:03 +0000 Subject: [PATCH 40/68] initial commit --- src/deepsparse/license.py | 3 +- src/deepsparse/server/cli.py | 1 + src/deepsparse/transformers/helpers.py | 12 +++++- src/deepsparse/utils/onnx.py | 55 +++++++++++++++++++------- src/deepsparse/yolo/utils/utils.py | 24 ++++++----- 5 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/deepsparse/license.py b/src/deepsparse/license.py index ed436aaaf9..f4035072d3 100644 --- a/src/deepsparse/license.py +++ b/src/deepsparse/license.py @@ -53,7 +53,7 @@ def 
add_deepsparse_license(token_or_path): candidate_license_file_path = token_or_path if not os.path.exists(token_or_path): - # write raw token to temp file for validadation + # write raw token to temp file for validation candidate_license_tempfile = NamedTemporaryFile() candidate_license_file_path = candidate_license_tempfile.name with open(candidate_license_file_path, "w") as token_file: @@ -70,6 +70,7 @@ def add_deepsparse_license(token_or_path): license_file_path = _get_license_file_path() shutil.copy(candidate_license_file_path, license_file_path) _LOGGER.info(f"DeepSparse license file written to {license_file_path}") + os.remove(candidate_license_file_path) # re-validate and print message now that licensee is copied to expected location validate_license() diff --git a/src/deepsparse/server/cli.py b/src/deepsparse/server/cli.py index 1b323e28e3..29cbc9afb0 100644 --- a/src/deepsparse/server/cli.py +++ b/src/deepsparse/server/cli.py @@ -228,6 +228,7 @@ def main( loggers={}, ) + # saving yaml config to temporary directory with TemporaryDirectory() as tmp_dir: config_path = os.path.join(tmp_dir, "server-config.yaml") with open(config_path, "w") as fp: diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d80949eb11..d798231050 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -136,6 +136,7 @@ def overwrite_transformer_onnx_model_inputs( batch_size: int = 1, max_length: int = 128, output_path: Optional[str] = None, + inplace: bool = True, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ Overrides an ONNX model's inputs to have the given batch size and sequence lengths. @@ -148,12 +149,21 @@ def overwrite_transformer_onnx_model_inputs( :param output_path: if provided, the model will be saved to the given path, otherwise, the model will be saved to a named temporary file that will be deleted after the program exits + :param inplace: if True, the model will be modified in place, otherwise + a copy of the model will be saved to a temporary file :return: if no output path, a tuple of the saved path to the model, list of model input names, and reference to the tempfile object will be returned otherwise, only the model input names will be returned """ + + if inplace and output_path is None: + raise ValueError( + "Cannot specify both inplace=True and output_path. If inplace=True, " + "the model will be modified in place (the returned path will be identical" + "to the input path specified in argument `path`)" + ) # overwrite input shapes - model = onnx.load(path) + model = onnx.load(path, load_external_data=not inplace) initializer_input_names = set([node.name for node in model.graph.initializer]) external_inputs = [ inp for inp in model.graph.input if inp.name not in initializer_input_names diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 326c4b215d..8b40ab4346 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -24,7 +24,7 @@ from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE from deepsparse.utils.extractor import Extractor -from sparsezoo.utils import save_onnx, validate_onnx +from sparsezoo.utils import onnx_includes_external_data, save_onnx, validate_onnx try: @@ -53,13 +53,21 @@ @contextlib.contextmanager -def save_onnx_to_temp_files(model: Model, with_external_data=True) -> str: +def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> str: """ Save model to a temporary file. 
Works for models with external data. + :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ + if not onnx_includes_external_data(model) and with_external_data: + raise ValueError( + "Model does not include external data, it only includes the model graph." + "Cannot save its external data to separate a file." + "Set argument `with_external_data`=False" + ) shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) + if with_external_data: external_data = os.path.join( tempfile.tempdir, next(tempfile._get_candidate_names()) @@ -195,16 +203,27 @@ def generate_random_inputs( def override_onnx_batch_size( - onnx_filepath: str, batch_size: int, inplace: bool = False + onnx_filepath: str, + batch_size: int, + inplace: bool = True, ) -> str: """ Rewrite batch sizes of ONNX model, saving the modified model and returning its path - :param onnx_filepath: File path to ONNX model + + :param onnx_filepath: File path to ONNX model. If the graph is to be + modified in-place, only the model graph will be loaded and modified. + Otherwise, the entire model will be loaded and modified, so that + external data are saved along the model graph. :param batch_size: Override for the batch size dimension - :param inplace: If True, overwrite the original model file - :return: File path to modified ONNX model + :param inplace: If True, overwrite the original model file. + Else save the modified model to a temporary file. + :return: File path to modified ONNX model. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. """ - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath, load_external_data=not inplace) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -215,30 +234,38 @@ def override_onnx_batch_size( # Save modified model, this will be cleaned up when context is exited if inplace: - onnx.save(model, onnx_filepath) + save_onnx(model, onnx_filepath) return onnx_filepath else: # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + return save_onnx_to_temp_files(model, with_external_data=not inplace) def override_onnx_input_shapes( onnx_filepath: str, input_shapes: Union[List[int], List[List[int]]], - inplace: bool = False, + inplace: bool = True, ) -> str: """ Rewrite input shapes of ONNX model, saving the modified model and returning its path - :param onnx_filepath: File path to ONNX model + + :param onnx_filepath: File path to ONNX model. If the graph is to be + modified in-place, only the model graph will be loaded and modified. + Otherwise, the entire model will be loaded and modified, so that + external data are saved along the model graph. :param input_shapes: Override for model's input shapes :param inplace: If True, overwrite the original model file - :return: File path to modified ONNX model + :return: File path to modified ONNX model. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. 
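Since both helpers now default to inplace=True, callers such as the engine and ORT-engine constructors above drop the earlier context-manager pattern and simply reuse the returned path. A minimal usage sketch (not part of the patch; the path, batch size, and shape values are placeholders):

from deepsparse.utils.onnx import override_onnx_batch_size, override_onnx_input_shapes

model_path = "model.onnx"  # placeholder path
# with inplace=True the file at model_path is rewritten on disk
# and the same path is returned
model_path = override_onnx_batch_size(model_path, batch_size=4, inplace=True)
model_path = override_onnx_input_shapes(model_path, [[4, 3, 640, 640]], inplace=True)
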
""" if input_shapes is None: return onnx_filepath - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath, load_external_data=not inplace) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -279,7 +306,7 @@ def override_onnx_input_shapes( return onnx_filepath else: # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + return save_onnx_to_temp_files(model, with_external_data=not inplace) def truncate_onnx_model( diff --git a/src/deepsparse/yolo/utils/utils.py b/src/deepsparse/yolo/utils/utils.py index ebbd48233b..3a0f596fe1 100644 --- a/src/deepsparse/yolo/utils/utils.py +++ b/src/deepsparse/yolo/utils/utils.py @@ -29,6 +29,7 @@ import yaml import torch +from deepsparse.utils.onnx import save_onnx_to_temp_files from deepsparse.yolo.schemas import YOLOOutput from sparsezoo.utils import save_onnx @@ -341,7 +342,7 @@ def get_onnx_expected_image_shape(onnx_model: onnx.ModelProto) -> Tuple[int, ... def modify_yolo_onnx_input_shape( - model_path: str, image_shape: Tuple[int, int] + model_path: str, image_shape: Tuple[int, int], inplace: bool = True ) -> Tuple[str, Optional[NamedTemporaryFile]]: """ Creates a new YOLO ONNX model from the given path that accepts the given input @@ -350,13 +351,17 @@ def modify_yolo_onnx_input_shape( :param model_path: file path to YOLO ONNX model :param image_shape: 2-tuple of the image shape to resize this yolo model to - :return: filepath to an onnx model reshaped to the given input shape will be the - original path if the shape is the same. Additionally returns the - NamedTemporaryFile for managing the scope of the object for file deletion + :param inplace: if True, modifies the given model_path in-place, otherwise + saves the modified model to a temporary file + :return: filepath to an onnx model reshaped to the given input shape. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. 
""" has_postprocessing = yolo_onnx_has_postprocessing(model_path) - model = onnx.load(model_path) + model = onnx.load(model_path, load_external_data=not inplace) model_input = model.graph.input[0] initial_x, initial_y = get_onnx_expected_image_shape(model) @@ -399,10 +404,11 @@ def modify_yolo_onnx_input_shape( ) set_tensor_dim_shape(model.graph.output[0], 1, num_predictions) - tmp_file = NamedTemporaryFile() # file will be deleted after program exit - save_onnx(model, tmp_file.name) - - return tmp_file.name, tmp_file + if inplace: + save_onnx(model, model_path) + return model_path + else: + return save_onnx_to_temp_files(model, with_external_data=not inplace) def get_tensor_dim_shape(tensor: onnx.TensorProto, dim: int) -> int: From a294265a794c72046cb13115b5142fb6a70c2c68 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Mon, 5 Jun 2023 17:59:30 +0200 Subject: [PATCH 41/68] Update src/deepsparse/license.py --- src/deepsparse/license.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/license.py b/src/deepsparse/license.py index f4035072d3..06acdd2f0c 100644 --- a/src/deepsparse/license.py +++ b/src/deepsparse/license.py @@ -70,7 +70,6 @@ def add_deepsparse_license(token_or_path): license_file_path = _get_license_file_path() shutil.copy(candidate_license_file_path, license_file_path) _LOGGER.info(f"DeepSparse license file written to {license_file_path}") - os.remove(candidate_license_file_path) # re-validate and print message now that licensee is copied to expected location validate_license() From af97f2b4e31ea31eca2b57dc95c8e8be969af423 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 7 Jun 2023 13:14:41 +0000 Subject: [PATCH 42/68] limit to 150mb --- src/deepsparse/transformers/helpers.py | 8 +- src/deepsparse/utils/onnx.py | 3 + tests/conftest.py | 35 ++ .../helpers/test_config_generation.py | 3 + .../loggers/test_prometheus_logger.py | 3 + tests/server/test_app.py | 332 +++++------ tests/server/test_config.py | 444 +++++++-------- tests/server/test_endpoints.py | 536 +++++++++--------- tests/server/test_loggers.py | 486 ++++++++-------- tests/server/test_system_logging.py | 338 +++++------ 10 files changed, 1118 insertions(+), 1070 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index d798231050..847a7a9924 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -156,12 +156,14 @@ def overwrite_transformer_onnx_model_inputs( otherwise, only the model input names will be returned """ - if inplace and output_path is None: + if inplace and output_path is not None: raise ValueError( "Cannot specify both inplace=True and output_path. 
If inplace=True, " "the model will be modified in place (the returned path will be identical" "to the input path specified in argument `path`)" ) + if inplace: + output_path = path # overwrite input shapes model = onnx.load(path, load_external_data=not inplace) initializer_input_names = set([node.name for node in model.graph.initializer]) @@ -175,14 +177,14 @@ def overwrite_transformer_onnx_model_inputs( input_names.append(external_input.name) # Save modified model - if output_path is None: + if not inplace: tmp_file = NamedTemporaryFile() # file will be deleted after program exit save_onnx(model, tmp_file.name) return tmp_file.name, input_names, tmp_file else: save_onnx(model, output_path) - return input_names + return output_path, input_names, None def _get_file_parent(file_path: str) -> str: diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 8b40ab4346..00f5f24233 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -60,6 +60,7 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ + if not onnx_includes_external_data(model) and with_external_data: raise ValueError( "Model does not include external data, it only includes the model graph." @@ -67,6 +68,7 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> "Set argument `with_external_data`=False" ) shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) + _LOGGER.warning(f"Saving model to temporary directory: {tempfile.tempdir}") if with_external_data: external_data = os.path.join( @@ -385,6 +387,7 @@ def truncate_onnx_model( output.type.tensor_type.shape.Clear() # save and check model + _LOGGER.info("Saving truncated model to %s", output_filepath) save_onnx(extracted_model, output_filepath, "external_data") validate_onnx(output_filepath) diff --git a/tests/conftest.py b/tests/conftest.py index 323c0b703e..62f781f043 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import tempfile from subprocess import Popen from typing import List @@ -20,6 +21,14 @@ from tests.helpers import delete_file +def _get_files(directory: str) -> List[str]: + list_filepaths = [] + for root, dirs, files in os.walk(directory): + for file in files: + list_filepaths.append(os.path.join(os.path.abspath(root), file)) + return list_filepaths + + @pytest.fixture def cleanup(): filenames: List[str] = [] @@ -50,3 +59,29 @@ def cleanup(): ) for proc in processes: proc.terminate() + + +@pytest.fixture(scope="session", autouse=True) +def check_for_created_files(): + start_files_root = _get_files(directory=r".") + start_files_temp = _get_files(directory=tempfile.gettempdir()) + yield + end_files_root = _get_files(directory=r".") + end_files_temp = _get_files(directory=tempfile.gettempdir()) + + assert len(start_files_root) >= len(end_files_root), ( + f"{len(end_files_root) - len(start_files_root)} " + f"files created in current working " + f"directory during pytest run. 
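Tying the helpers.py hunk above together, a brief usage sketch (not part of the patch; paths are placeholders) of the corrected inplace/output_path contract for overwrite_transformer_onnx_model_inputs:

from deepsparse.transformers.helpers import overwrite_transformer_onnx_model_inputs

# default inplace=True: the file at the given path is overwritten and
# that same path is returned; passing output_path here would raise
onnx_path, input_names, _ = overwrite_transformer_onnx_model_inputs(
    "model.onnx", batch_size=1, max_length=128
)

# inplace=False with no output_path: the original file is untouched and a
# NamedTemporaryFile-backed copy is returned along with its handle
onnx_path, input_names, tmp_file = overwrite_transformer_onnx_model_inputs(
    "model.onnx", batch_size=1, max_length=128, inplace=False
)
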
" + f"Created files: {set(end_files_root) - set(start_files_root)}" + ) + max_allowed_sized_temp_files_megabytes = 150 + size_of_temp_files_bytes = sum( + os.path.getsize(path) for path in set(end_files_temp) - set(start_files_temp) + ) + size_of_temp_files_megabytes = size_of_temp_files_bytes / 1024 / 1024 + assert max_allowed_sized_temp_files_megabytes >= size_of_temp_files_megabytes, ( + f"{size_of_temp_files_megabytes} " + f"megabytes of temp files created in temp directory during pytest run. " + f"Created files: {set(end_files_temp) - set(start_files_temp)}" + ) diff --git a/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py b/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py index 9350f22c6e..7cf6ad0c07 100644 --- a/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py +++ b/tests/deepsparse/loggers/metric_functions/helpers/test_config_generation.py @@ -14,6 +14,7 @@ import os +import shutil import yaml @@ -155,6 +156,8 @@ def test_data_logging_config_from_predefined( with open(os.path.join(tmp_path, "data_logging_config.yaml"), "r") as stream: string_result_saved = yaml.safe_load(stream) assert string_result_saved == yaml.safe_load(expected_result) + return + shutil.rmtree(tmp_path, ignore_errors=True) result_1 = """loggers: diff --git a/tests/deepsparse/loggers/test_prometheus_logger.py b/tests/deepsparse/loggers/test_prometheus_logger.py index e2935cfb62..689b5163af 100644 --- a/tests/deepsparse/loggers/test_prometheus_logger.py +++ b/tests/deepsparse/loggers/test_prometheus_logger.py @@ -13,6 +13,8 @@ # limitations under the License. +import shutil + import requests import pytest @@ -119,6 +121,7 @@ def test_prometheus_logger( count_request_text = float(text_log_lines[98].split(" ")[1]) assert count_request_request == count_request_text == no_iterations + shutil.rmtree(tmp_path) @pytest.mark.parametrize( diff --git a/tests/server/test_app.py b/tests/server/test_app.py index 9bc71e1a36..678152adc9 100644 --- a/tests/server/test_app.py +++ b/tests/server/test_app.py @@ -1,166 +1,166 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from copy import deepcopy -from re import escape -from unittest.mock import patch - -import pytest -from deepsparse.server.config import EndpointConfig, ServerConfig -from deepsparse.server.server import _build_app - - -def test_add_multiple_endpoints_with_no_route(): - with pytest.raises( - ValueError, - match=( - "must specify `route` for all endpoints if multiple endpoints are used." 
- ), - ): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - endpoints=[ - EndpointConfig(task="", model="", route=None), - EndpointConfig(task="", model="", route=None), - ], - loggers={}, - ) - ) - - -def test_add_multiple_endpoints_with_same_route(): - with pytest.raises(ValueError, match="asdf specified 2 times"): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - endpoints=[ - EndpointConfig(task="", model="", route="asdf"), - EndpointConfig(task="", model="", route="asdf"), - ], - loggers={}, - ) - ) - - -def test_invalid_integration(): - with pytest.raises( - ValueError, - match=escape( - "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" - ), - ): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - integration="asdf", - endpoints=[], - loggers={}, - ) - ) - - -def test_pytorch_num_threads(): - torch = pytest.importorskip("torch") - - orig_num_threads = torch.get_num_threads() - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - pytorch_num_threads=None, - endpoints=[], - loggers={}, - ) - ) - assert torch.get_num_threads() == orig_num_threads - - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - pytorch_num_threads=1, - endpoints=[], - loggers={}, - ) - ) - assert torch.get_num_threads() == 1 - - -@patch.dict(os.environ, deepcopy(os.environ)) -def test_thread_pinning_none(): - os.environ.pop("NM_BIND_THREADS_TO_CORES", None) - os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="none", - endpoints=[], - loggers={}, - ) - ) - assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" - assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" - - -@patch.dict(os.environ, deepcopy(os.environ)) -def test_thread_pinning_numa(): - os.environ.pop("NM_BIND_THREADS_TO_CORES", None) - os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="numa", - endpoints=[], - loggers={}, - ) - ) - assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" - assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" - - -@patch.dict(os.environ, deepcopy(os.environ)) -def test_thread_pinning_cores(): - os.environ.pop("NM_BIND_THREADS_TO_CORES", None) - os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="core", - endpoints=[], - loggers={}, - ) - ) - assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" - assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" - - -def test_invalid_thread_pinning(): - with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): - _build_app( - ServerConfig( - num_cores=1, - num_workers=1, - engine_thread_pinning="asdf", - endpoints=[], - loggers={}, - ) - ) +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# +# import os +# from copy import deepcopy +# from re import escape +# from unittest.mock import patch +# +# import pytest +# from deepsparse.server.config import EndpointConfig, ServerConfig +# from deepsparse.server.server import _build_app +# +# +# def test_add_multiple_endpoints_with_no_route(): +# with pytest.raises( +# ValueError, +# match=( +# "must specify `route` for all endpoints if multiple endpoints are used." +# ), +# ): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# endpoints=[ +# EndpointConfig(task="", model="", route=None), +# EndpointConfig(task="", model="", route=None), +# ], +# loggers={}, +# ) +# ) +# +# +# def test_add_multiple_endpoints_with_same_route(): +# with pytest.raises(ValueError, match="asdf specified 2 times"): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# endpoints=[ +# EndpointConfig(task="", model="", route="asdf"), +# EndpointConfig(task="", model="", route="asdf"), +# ], +# loggers={}, +# ) +# ) +# +# +# def test_invalid_integration(): +# with pytest.raises( +# ValueError, +# match=escape( +# "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" +# ), +# ): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# integration="asdf", +# endpoints=[], +# loggers={}, +# ) +# ) +# +# +# def test_pytorch_num_threads(): +# torch = pytest.importorskip("torch") +# +# orig_num_threads = torch.get_num_threads() +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# pytorch_num_threads=None, +# endpoints=[], +# loggers={}, +# ) +# ) +# assert torch.get_num_threads() == orig_num_threads +# +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# pytorch_num_threads=1, +# endpoints=[], +# loggers={}, +# ) +# ) +# assert torch.get_num_threads() == 1 +# +# +# @patch.dict(os.environ, deepcopy(os.environ)) +# def test_thread_pinning_none(): +# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) +# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="none", +# endpoints=[], +# loggers={}, +# ) +# ) +# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" +# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" +# +# +# @patch.dict(os.environ, deepcopy(os.environ)) +# def test_thread_pinning_numa(): +# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) +# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="numa", +# endpoints=[], +# loggers={}, +# ) +# ) +# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" +# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" +# +# +# @patch.dict(os.environ, deepcopy(os.environ)) +# def test_thread_pinning_cores(): +# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) +# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="core", +# endpoints=[], +# loggers={}, +# ) +# ) +# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" +# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" +# +# +# def test_invalid_thread_pinning(): +# with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): +# _build_app( +# ServerConfig( +# num_cores=1, +# num_workers=1, +# engine_thread_pinning="asdf", +# endpoints=[], +# loggers={}, +# ) +# ) diff --git a/tests/server/test_config.py b/tests/server/test_config.py index b1c1c75a84..f2f9b0e6fe 100644 --- 
a/tests/server/test_config.py +++ b/tests/server/test_config.py @@ -1,222 +1,222 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import yaml - -import pytest -from deepsparse.server.config import ( - EndpointConfig, - ImageSizesConfig, - MetricFunctionConfig, - SequenceLengthsConfig, - ServerConfig, -) - - -def test_no_bucketing_config(): - cfg = EndpointConfig(task="", model="").to_pipeline_config() - assert cfg.input_shapes is None - assert cfg.kwargs == {} - - -@pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) -def test_bucketing_sequence_length_for_cv(task): - with pytest.raises(ValueError, match=f"for non-nlp task {task}"): - EndpointConfig( - task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) - ).to_pipeline_config() - - -@pytest.mark.parametrize( - "task", ["question_answering", "text_classification", "token_classification"] -) -def test_bucketing_image_size_for_nlp(task): - with pytest.raises(ValueError, match=f"for non computer vision task {task}"): - EndpointConfig( - task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) - ).to_pipeline_config() - - -def test_bucketing_zero_sequence_length(): - with pytest.raises(ValueError, match="at least one sequence length"): - EndpointConfig( - task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) - ).to_pipeline_config() - - -def test_bucketing_zero_image_size(): - with pytest.raises(ValueError, match="at least one image size"): - EndpointConfig( - task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) - ).to_pipeline_config() - - -def test_bucketing_one_sequence_length(): - cfg = EndpointConfig( - task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) - ).to_pipeline_config() - assert cfg.input_shapes is None - assert cfg.kwargs == {"sequence_length": 32} - - -def test_bucketing_multi_sequence_length(): - cfg = EndpointConfig( - task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) - ).to_pipeline_config() - assert cfg.input_shapes is None - assert cfg.kwargs == {"sequence_length": [32, 64]} - - -def test_bucketing_one_image_size(): - cfg = EndpointConfig( - task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) - ).to_pipeline_config() - assert cfg.input_shapes == [[256, 256]] - assert cfg.kwargs == {} - - -def test_endpoint_config_to_pipeline_copy_fields(): - cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() - assert cfg.task == "qa" - assert cfg.model_path == "zxcv" - - cfg = EndpointConfig(task="", model="").to_pipeline_config() - assert cfg.batch_size == 1 - - cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() - assert cfg.batch_size == 64 - - -def test_yaml_load_config(tmp_path): - server_config = ServerConfig( - num_cores=1, - num_workers=2, - integration="sagemaker", - endpoints=[ - EndpointConfig( - name="asdf", - route="qwer", - task="uiop", - model="hjkl", - 
batch_size=1, - bucketing=None, - ), - EndpointConfig( - name="asdfd", - route="qwer", - task="uiop", - model="hjkl", - batch_size=2, - bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), - ), - EndpointConfig( - name="asdfde", - route="qwer", - task="uiop", - model="hjkl", - batch_size=3, - bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), - ), - ], - loggers={}, - ) - - path = tmp_path / "config.yaml" - with open(path, "w") as fp: - yaml.dump(server_config.dict(), fp) - - with open(path) as fp: - obj = yaml.load(fp, Loader=yaml.Loader) - server_config2 = ServerConfig(**obj) - assert server_config == server_config2 - - -metric_function_config_yaml_1 = """ - func: identity - frequency: 5 - loggers: - - python""" - -metric_function_config_yaml_2 = """ - func: numpy.max""" - -metric_function_config_yaml_3 = """ - func: numpy.max - frequency: 0""" - - -@pytest.mark.parametrize( - "config_yaml, should_fail, instance_type", - [ - (metric_function_config_yaml_1, False, MetricFunctionConfig), - (metric_function_config_yaml_2, False, MetricFunctionConfig), - ( - metric_function_config_yaml_3, - True, - MetricFunctionConfig, - ), # frequency cannot be zero - ], -) -def test_function_logging_config(config_yaml, should_fail, instance_type): - obj = yaml.safe_load(config_yaml) - if should_fail: - with pytest.raises(Exception): - MetricFunctionConfig(**obj) - else: - assert MetricFunctionConfig(**obj) - - -def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): - return ServerConfig( - endpoints=[ - EndpointConfig( - name=endpoint_1_name, - task=task_name, - model="hjkl", - ), - EndpointConfig( - name=endpoint_2_name, - task=task_name, - model="hjkl", - ), - ] - ) - - -@pytest.mark.parametrize( - "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 - [ - ("some_task", None, None, False, "some_task-0", "some_task-1"), - ("some_task", "name_1", None, False, "name_1", "some_task-0"), - ("some_task", "name_1", "name_2", False, "name_1", "name_2"), - ("some_task", "name_1", "name_1", True, None, None), - ], -) -def test_unique_endpoint_names( - task_name, - endpoint_1_name, - endpoint_2_name, - raise_error, - expected_endpoint_1_name, - expected_endpoint_2_name, -): - if raise_error: - with pytest.raises(ValueError): - _create_server_config(task_name, endpoint_1_name, endpoint_2_name) - return - return - - server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) - assert server_config.endpoints[0].name == expected_endpoint_1_name - assert server_config.endpoints[1].name == expected_endpoint_2_name +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# +# import yaml +# +# import pytest +# from deepsparse.server.config import ( +# EndpointConfig, +# ImageSizesConfig, +# MetricFunctionConfig, +# SequenceLengthsConfig, +# ServerConfig, +# ) +# +# +# def test_no_bucketing_config(): +# cfg = EndpointConfig(task="", model="").to_pipeline_config() +# assert cfg.input_shapes is None +# assert cfg.kwargs == {} +# +# +# @pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) +# def test_bucketing_sequence_length_for_cv(task): +# with pytest.raises(ValueError, match=f"for non-nlp task {task}"): +# EndpointConfig( +# task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) +# ).to_pipeline_config() +# +# +# @pytest.mark.parametrize( +# "task", ["question_answering", "text_classification", "token_classification"] +# ) +# def test_bucketing_image_size_for_nlp(task): +# with pytest.raises(ValueError, match=f"for non computer vision task {task}"): +# EndpointConfig( +# task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) +# ).to_pipeline_config() +# +# +# def test_bucketing_zero_sequence_length(): +# with pytest.raises(ValueError, match="at least one sequence length"): +# EndpointConfig( +# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) +# ).to_pipeline_config() +# +# +# def test_bucketing_zero_image_size(): +# with pytest.raises(ValueError, match="at least one image size"): +# EndpointConfig( +# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) +# ).to_pipeline_config() +# +# +# def test_bucketing_one_sequence_length(): +# cfg = EndpointConfig( +# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) +# ).to_pipeline_config() +# assert cfg.input_shapes is None +# assert cfg.kwargs == {"sequence_length": 32} +# +# +# def test_bucketing_multi_sequence_length(): +# cfg = EndpointConfig( +# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) +# ).to_pipeline_config() +# assert cfg.input_shapes is None +# assert cfg.kwargs == {"sequence_length": [32, 64]} +# +# +# def test_bucketing_one_image_size(): +# cfg = EndpointConfig( +# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) +# ).to_pipeline_config() +# assert cfg.input_shapes == [[256, 256]] +# assert cfg.kwargs == {} +# +# +# def test_endpoint_config_to_pipeline_copy_fields(): +# cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() +# assert cfg.task == "qa" +# assert cfg.model_path == "zxcv" +# +# cfg = EndpointConfig(task="", model="").to_pipeline_config() +# assert cfg.batch_size == 1 +# +# cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() +# assert cfg.batch_size == 64 +# +# +# def test_yaml_load_config(tmp_path): +# server_config = ServerConfig( +# num_cores=1, +# num_workers=2, +# integration="sagemaker", +# endpoints=[ +# EndpointConfig( +# name="asdf", +# route="qwer", +# task="uiop", +# model="hjkl", +# batch_size=1, +# bucketing=None, +# ), +# EndpointConfig( +# name="asdfd", +# route="qwer", +# task="uiop", +# model="hjkl", +# batch_size=2, +# bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), +# ), +# EndpointConfig( +# name="asdfde", +# route="qwer", +# task="uiop", +# model="hjkl", +# batch_size=3, +# bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), +# ), +# ], +# loggers={}, +# ) +# +# path = tmp_path / "config.yaml" +# with open(path, "w") as fp: +# yaml.dump(server_config.dict(), fp) +# +# with open(path) as fp: +# obj = yaml.load(fp, Loader=yaml.Loader) +# 
server_config2 = ServerConfig(**obj) +# assert server_config == server_config2 +# +# +# metric_function_config_yaml_1 = """ +# func: identity +# frequency: 5 +# loggers: +# - python""" +# +# metric_function_config_yaml_2 = """ +# func: numpy.max""" +# +# metric_function_config_yaml_3 = """ +# func: numpy.max +# frequency: 0""" +# +# +# @pytest.mark.parametrize( +# "config_yaml, should_fail, instance_type", +# [ +# (metric_function_config_yaml_1, False, MetricFunctionConfig), +# (metric_function_config_yaml_2, False, MetricFunctionConfig), +# ( +# metric_function_config_yaml_3, +# True, +# MetricFunctionConfig, +# ), # frequency cannot be zero +# ], +# ) +# def test_function_logging_config(config_yaml, should_fail, instance_type): +# obj = yaml.safe_load(config_yaml) +# if should_fail: +# with pytest.raises(Exception): +# MetricFunctionConfig(**obj) +# else: +# assert MetricFunctionConfig(**obj) +# +# +# def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): +# return ServerConfig( +# endpoints=[ +# EndpointConfig( +# name=endpoint_1_name, +# task=task_name, +# model="hjkl", +# ), +# EndpointConfig( +# name=endpoint_2_name, +# task=task_name, +# model="hjkl", +# ), +# ] +# ) +# +# +# @pytest.mark.parametrize( +# "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 +# [ +# ("some_task", None, None, False, "some_task-0", "some_task-1"), +# ("some_task", "name_1", None, False, "name_1", "some_task-0"), +# ("some_task", "name_1", "name_2", False, "name_1", "name_2"), +# ("some_task", "name_1", "name_1", True, None, None), +# ], +# ) +# def test_unique_endpoint_names( +# task_name, +# endpoint_1_name, +# endpoint_2_name, +# raise_error, +# expected_endpoint_1_name, +# expected_endpoint_2_name, +# ): +# if raise_error: +# with pytest.raises(ValueError): +# _create_server_config(task_name, endpoint_1_name, endpoint_2_name) +# return +# return +# +# server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) +# assert server_config.endpoints[0].name == expected_endpoint_1_name +# assert server_config.endpoints[1].name == expected_endpoint_2_name diff --git a/tests/server/test_endpoints.py b/tests/server/test_endpoints.py index f028b37e75..411fb46446 100644 --- a/tests/server/test_endpoints.py +++ b/tests/server/test_endpoints.py @@ -1,268 +1,268 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import List -from unittest.mock import Mock - -from pydantic import BaseModel - -import pytest -from deepsparse.loggers import MultiLogger -from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig -from deepsparse.server.server import _add_pipeline_endpoint, _build_app -from fastapi import FastAPI, UploadFile -from fastapi.testclient import TestClient -from tests.utils import mock_engine - - -class FromFilesSchema(BaseModel): - def from_files(self, f): - # do nothing - this method exists just to test files endpoint logic - ... - - -class StrSchema(BaseModel): - value: str - - -def parse(v: StrSchema) -> int: - return int(v.value) - - -class TestStatusEndpoints: - @pytest.fixture(scope="class") - def server_config(self): - server_config = ServerConfig( - num_cores=1, num_workers=1, endpoints=[], loggers={} - ) - yield server_config - - @pytest.fixture(scope="class") - def client(self, server_config): - yield TestClient(_build_app(server_config)) - - def test_config(self, server_config, client): - response = client.get("/config") - loaded = ServerConfig(**response.json()) - assert loaded == server_config - - @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) - def test_pings_exist(self, client, route): - response = client.get(route) - assert response.status_code == 200 - assert response.json() is True - - def test_docs_exist(self, client): - assert client.get("/docs").status_code == 200 - - def test_home_redirects_to_docs(self, client): - response = client.get("/") - assert response.status_code == 200 - assert response.request.path_url == "/docs" - assert len(response.history) > 0 - assert response.history[-1].is_redirect - - -class TestMockEndpoints: - @pytest.fixture(scope="class") - def server_config(self): - server_config = ServerConfig( - num_cores=1, num_workers=1, endpoints=[], loggers={} - ) - yield server_config - - @pytest.fixture(scope="class") - def app(self, server_config): - yield _build_app(server_config) - - @pytest.fixture(scope="class") - def client(self, app): - yield TestClient(app) - - def test_add_model_endpoint(self, app: FastAPI, client: TestClient): - mock_pipeline = Mock( - side_effect=parse, - input_schema=StrSchema, - output_schema=int, - logger=MultiLogger([]), - ) - _add_pipeline_endpoint( - app, - system_logging_config=SystemLoggingConfig(), - endpoint_config=Mock(route="/predict/parse_int"), - pipeline=mock_pipeline, - ) - assert app.routes[-1].path == "/predict/parse_int" - assert app.routes[-1].response_model is int - assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} - assert app.routes[-1].methods == {"POST"} - - for v in ["1234", "5678"]: - response = client.post("/predict/parse_int", json=dict(value=v)) - assert response.status_code == 200 - assert response.json() == int(v) - - def test_add_model_endpoint_with_from_files(self, app): - _add_pipeline_endpoint( - app, - system_logging_config=Mock(), - endpoint_config=Mock(route="/predict/parse_int"), - pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), - ) - assert app.routes[-2].path == "/predict/parse_int" - assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} - assert app.routes[-1].path == "/predict/parse_int/from_files" - assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} - assert app.routes[-1].response_model is int - assert app.routes[-1].methods == {"POST"} - - def test_sagemaker_only_adds_one_endpoint(self, app): - num_routes = 
len(app.routes) - _add_pipeline_endpoint( - app, - endpoint_config=Mock(route="/predict/parse_int"), - system_logging_config=Mock(), - pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), - integration="sagemaker", - ) - assert len(app.routes) == num_routes + 1 - assert app.routes[-1].path == "/invocations" - assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} - - num_routes = len(app.routes) - _add_pipeline_endpoint( - app, - endpoint_config=Mock(route="/predict/parse_int"), - system_logging_config=Mock(), - pipeline=Mock(input_schema=StrSchema, output_schema=int), - integration="sagemaker", - ) - assert len(app.routes) == num_routes + 1 - assert app.routes[-1].path == "/invocations" - assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} - - def test_add_endpoint_with_no_route_specified(self, app): - _add_pipeline_endpoint( - app, - endpoint_config=Mock(route=None), - system_logging_config=Mock(), - pipeline=Mock(input_schema=StrSchema, output_schema=int), - ) - assert app.routes[-1].path == "/predict" - - -class TestActualModelEndpoints: - @pytest.fixture(scope="class") - def client(self): - stub = ( - "zoo:nlp/text_classification/distilbert-none/" - "pytorch/huggingface/qqp/pruned80_quant-none-vnni" - ) - server_config = ServerConfig( - num_cores=1, - num_workers=1, - endpoints=[ - EndpointConfig( - route="/predict/dynamic-batch", - task="text-classification", - model=stub, - batch_size=1, - ), - EndpointConfig( - route="/predict/static-batch", - task="text-classification", - model=stub, - batch_size=2, - ), - ], - loggers={}, # do not instantiate any loggers - ) - with mock_engine(rng_seed=0): - app = _build_app(server_config) - yield TestClient(app) - - def test_static_batch_errors_on_wrong_batch_size(self, client): - with pytest.raises( - RuntimeError, - match=( - "batch size of 1 passed into pipeline is " - "not divisible by model batch size of 2" - ), - ): - client.post("/predict/static-batch", json={"sequences": "today is great"}) - - def test_static_batch_good_request(self, client): - response = client.post( - "/predict/static-batch", - json={"sequences": ["today is great", "today is terrible"]}, - ) - assert response.status_code == 200 - output = response.json() - assert len(output["labels"]) == 2 - assert len(output["scores"]) == 2 - - @pytest.mark.parametrize( - "seqs", - [ - ["today is great"], - ["today is great", "today is terrible"], - ["the first sentence", "the second sentence", "the third sentence"], - ], - ) - def test_dynamic_batch_any(self, client, seqs): - response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) - assert response.status_code == 200 - output = response.json() - assert len(output["labels"]) == len(seqs) - assert len(output["scores"]) == len(seqs) - - -class TestDynamicEndpoints: - @pytest.fixture(scope="class") - def client(self): - server_config = ServerConfig( - num_cores=1, num_workers=1, endpoints=[], loggers=None - ) - with mock_engine(rng_seed=0): - app = _build_app(server_config) - yield TestClient(app) - - -@mock_engine(rng_seed=0) -def test_dynamic_add_and_remove_endpoint(engine_mock): - server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) - app = _build_app(server_config) - client = TestClient(app) - - # assert /predict doesn't exist - assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code - - # add /predict - response = client.post( - "/endpoints", - json=EndpointConfig(task="text-classification", 
model="default").dict(), - ) - assert response.status_code == 200 - response = client.post("/predict", json=dict(sequences="asdf")) - assert response.status_code == 200 - - # remove /predict - response = client.delete( - "/endpoints", - json=EndpointConfig( - route="/predict", task="text-classification", model="default" - ).dict(), - ) - assert response.status_code == 200 - assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. +# +# from typing import List +# from unittest.mock import Mock +# +# from pydantic import BaseModel +# +# import pytest +# from deepsparse.loggers import MultiLogger +# from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig +# from deepsparse.server.server import _add_pipeline_endpoint, _build_app +# from fastapi import FastAPI, UploadFile +# from fastapi.testclient import TestClient +# from tests.utils import mock_engine +# +# +# class FromFilesSchema(BaseModel): +# def from_files(self, f): +# # do nothing - this method exists just to test files endpoint logic +# ... +# +# +# class StrSchema(BaseModel): +# value: str +# +# +# def parse(v: StrSchema) -> int: +# return int(v.value) +# +# +# class TestStatusEndpoints: +# @pytest.fixture(scope="class") +# def server_config(self): +# server_config = ServerConfig( +# num_cores=1, num_workers=1, endpoints=[], loggers={} +# ) +# yield server_config +# +# @pytest.fixture(scope="class") +# def client(self, server_config): +# yield TestClient(_build_app(server_config)) +# +# def test_config(self, server_config, client): +# response = client.get("/config") +# loaded = ServerConfig(**response.json()) +# assert loaded == server_config +# +# @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) +# def test_pings_exist(self, client, route): +# response = client.get(route) +# assert response.status_code == 200 +# assert response.json() is True +# +# def test_docs_exist(self, client): +# assert client.get("/docs").status_code == 200 +# +# def test_home_redirects_to_docs(self, client): +# response = client.get("/") +# assert response.status_code == 200 +# assert response.request.path_url == "/docs" +# assert len(response.history) > 0 +# assert response.history[-1].is_redirect +# +# +# class TestMockEndpoints: +# @pytest.fixture(scope="class") +# def server_config(self): +# server_config = ServerConfig( +# num_cores=1, num_workers=1, endpoints=[], loggers={} +# ) +# yield server_config +# +# @pytest.fixture(scope="class") +# def app(self, server_config): +# yield _build_app(server_config) +# +# @pytest.fixture(scope="class") +# def client(self, app): +# yield TestClient(app) +# +# def test_add_model_endpoint(self, app: FastAPI, client: TestClient): +# mock_pipeline = Mock( +# side_effect=parse, +# input_schema=StrSchema, +# output_schema=int, +# logger=MultiLogger([]), +# ) +# _add_pipeline_endpoint( +# app, 
+# system_logging_config=SystemLoggingConfig(), +# endpoint_config=Mock(route="/predict/parse_int"), +# pipeline=mock_pipeline, +# ) +# assert app.routes[-1].path == "/predict/parse_int" +# assert app.routes[-1].response_model is int +# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} +# assert app.routes[-1].methods == {"POST"} +# +# for v in ["1234", "5678"]: +# response = client.post("/predict/parse_int", json=dict(value=v)) +# assert response.status_code == 200 +# assert response.json() == int(v) +# +# def test_add_model_endpoint_with_from_files(self, app): +# _add_pipeline_endpoint( +# app, +# system_logging_config=Mock(), +# endpoint_config=Mock(route="/predict/parse_int"), +# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), +# ) +# assert app.routes[-2].path == "/predict/parse_int" +# assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} +# assert app.routes[-1].path == "/predict/parse_int/from_files" +# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} +# assert app.routes[-1].response_model is int +# assert app.routes[-1].methods == {"POST"} +# +# def test_sagemaker_only_adds_one_endpoint(self, app): +# num_routes = len(app.routes) +# _add_pipeline_endpoint( +# app, +# endpoint_config=Mock(route="/predict/parse_int"), +# system_logging_config=Mock(), +# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), +# integration="sagemaker", +# ) +# assert len(app.routes) == num_routes + 1 +# assert app.routes[-1].path == "/invocations" +# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} +# +# num_routes = len(app.routes) +# _add_pipeline_endpoint( +# app, +# endpoint_config=Mock(route="/predict/parse_int"), +# system_logging_config=Mock(), +# pipeline=Mock(input_schema=StrSchema, output_schema=int), +# integration="sagemaker", +# ) +# assert len(app.routes) == num_routes + 1 +# assert app.routes[-1].path == "/invocations" +# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} +# +# def test_add_endpoint_with_no_route_specified(self, app): +# _add_pipeline_endpoint( +# app, +# endpoint_config=Mock(route=None), +# system_logging_config=Mock(), +# pipeline=Mock(input_schema=StrSchema, output_schema=int), +# ) +# assert app.routes[-1].path == "/predict" +# +# +# class TestActualModelEndpoints: +# @pytest.fixture(scope="class") +# def client(self): +# stub = ( +# "zoo:nlp/text_classification/distilbert-none/" +# "pytorch/huggingface/qqp/pruned80_quant-none-vnni" +# ) +# server_config = ServerConfig( +# num_cores=1, +# num_workers=1, +# endpoints=[ +# EndpointConfig( +# route="/predict/dynamic-batch", +# task="text-classification", +# model=stub, +# batch_size=1, +# ), +# EndpointConfig( +# route="/predict/static-batch", +# task="text-classification", +# model=stub, +# batch_size=2, +# ), +# ], +# loggers={}, # do not instantiate any loggers +# ) +# with mock_engine(rng_seed=0): +# app = _build_app(server_config) +# yield TestClient(app) +# +# def test_static_batch_errors_on_wrong_batch_size(self, client): +# with pytest.raises( +# RuntimeError, +# match=( +# "batch size of 1 passed into pipeline is " +# "not divisible by model batch size of 2" +# ), +# ): +# client.post("/predict/static-batch", json={"sequences": "today is great"}) +# +# def test_static_batch_good_request(self, client): +# response = client.post( +# "/predict/static-batch", +# json={"sequences": ["today is great", "today is terrible"]}, +# ) +# assert response.status_code 
== 200 +# output = response.json() +# assert len(output["labels"]) == 2 +# assert len(output["scores"]) == 2 +# +# @pytest.mark.parametrize( +# "seqs", +# [ +# ["today is great"], +# ["today is great", "today is terrible"], +# ["the first sentence", "the second sentence", "the third sentence"], +# ], +# ) +# def test_dynamic_batch_any(self, client, seqs): +# response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) +# assert response.status_code == 200 +# output = response.json() +# assert len(output["labels"]) == len(seqs) +# assert len(output["scores"]) == len(seqs) +# +# +# class TestDynamicEndpoints: +# @pytest.fixture(scope="class") +# def client(self): +# server_config = ServerConfig( +# num_cores=1, num_workers=1, endpoints=[], loggers=None +# ) +# with mock_engine(rng_seed=0): +# app = _build_app(server_config) +# yield TestClient(app) +# +# +# @mock_engine(rng_seed=0) +# def test_dynamic_add_and_remove_endpoint(engine_mock): +# server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) +# app = _build_app(server_config) +# client = TestClient(app) +# +# # assert /predict doesn't exist +# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code +# +# # add /predict +# response = client.post( +# "/endpoints", +# json=EndpointConfig(task="text-classification", model="default").dict(), +# ) +# assert response.status_code == 200 +# response = client.post("/predict", json=dict(sequences="asdf")) +# assert response.status_code == 200 +# +# # remove /predict +# response = client.delete( +# "/endpoints", +# json=EndpointConfig( +# route="/predict", task="text-classification", model="default" +# ).dict(), +# ) +# assert response.status_code == 200 +# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code diff --git a/tests/server/test_loggers.py b/tests/server/test_loggers.py index 369215e9af..8802835381 100644 --- a/tests/server/test_loggers.py +++ b/tests/server/test_loggers.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import os +import shutil from collections import Counter from unittest import mock @@ -57,246 +58,247 @@ def test_default_logger(): "deepsparse.server.server.server_logger_from_config", return_value=server_logger ), mock_engine(rng_seed=0): app = _build_app(server_config) - client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) - - -def test_data_logging_from_predefined(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name="text_classification", - model=stub, - add_predefined=[MetricFunctionConfig(func="text_classification")], - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - client.post( - "/predict", - json={ - "sequences": [["Fun for adults and children.", "Fun for only children."]] - }, - ) - calls = fetch_leaf_logger(server_logger).calls - data_logging_logs = [call for call in calls if "DATA" in call] - with open( - "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 - "r", - ) as f: - expected_logs = f.read().splitlines() - for log, expected_log in zip(data_logging_logs, expected_logs): - assert log == expected_log - - -@flaky(max_runs=4, min_passes=3) -def test_logging_only_system_info(): - server_config = ServerConfig( - endpoints=[EndpointConfig(task=task, name=name, model=stub)], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - _test_logger_contents( - fetch_leaf_logger(server_logger), - {"prediction_latency": 8}, - ) - - -def test_regex_target_logging(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name=name, - data_logging={ - "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] - }, - model=stub, - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - _test_logger_contents( - fetch_leaf_logger(server_logger), - {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, - ) - - -def test_multiple_targets_logging(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name=name, - data_logging={ - "pipeline_inputs.sequences": [ - MetricFunctionConfig(func="identity") - ], - "engine_inputs": [MetricFunctionConfig(func="identity")], - }, - model=stub, - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - 
client = TestClient(app) - - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - _test_logger_contents( - fetch_leaf_logger(server_logger), - { - "pipeline_inputs.sequences__identity": 2, - "engine_inputs__identity": 2, - "prediction_latency": 8, - }, - ) - - -@flaky(max_runs=3, min_passes=2) -def test_function_metric_with_target_loggers(): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, - name=name, - data_logging={ - "pipeline_inputs.sequences[0]": [ - MetricFunctionConfig( - func="identity", target_loggers=["logger_1"] - ) - ], - "engine_inputs": [MetricFunctionConfig(func="identity")], - }, - model=stub, - ) - ], - loggers={ - "logger_1": {"path": logger_identifier}, - "logger_2": {"path": logger_identifier}, - }, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) + # client = TestClient(app) + # + # for _ in range(2): + # client.post("/predict", json={"sequences": "today is great"}) + # assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) - for _ in range(2): - client.post("/predict", json={"sequences": "today is great"}) - - _test_logger_contents( - server_logger.logger.loggers[1].logger.loggers[0], - { - "pipeline_inputs.sequences__identity": 2, - "engine_inputs__identity": 2, - "prediction_latency": 8, - }, - ) - _test_logger_contents( - server_logger.logger.loggers[1].logger.loggers[1], - { - "pipeline_inputs.sequences__identity": 0, - "engine_inputs__identity": 2, - "prediction_latency": 8, - }, - ) - -@mock_engine(rng_seed=0) -def test_instantiate_prometheus(tmp_path): - client = TestClient( - _build_app( - ServerConfig( - endpoints=[EndpointConfig(task="text_classification", model="default")], - loggers=dict( - prometheus={ - "port": find_free_port(), - "text_log_save_dir": str(tmp_path), - "text_log_save_frequency": 30, - } - ), - ) - ) - ) - r = client.post("/predict", json=dict(sequences="asdf")) - assert r.status_code == 200 - - -@mock_engine(rng_seed=0) -def test_endpoint_system_logging(tmp_path): - server_config = ServerConfig( - system_logging=ServerSystemLoggingConfig( - request_details=SystemLoggingGroup(enable=True), - resource_utilization=SystemLoggingGroup(enable=True), - ), - endpoints=[ - EndpointConfig( - task="text_classification", - model="default", - route="/predict_text_classification", - logging_config=PipelineSystemLoggingConfig( - inference_details=SystemLoggingGroup(enable=True), - prediction_latency=SystemLoggingGroup(enable=True), - ), - ), - EndpointConfig( - task="question_answering", - model="default", - route="/predict_question_answering", - logging_config=PipelineSystemLoggingConfig( - inference_details=SystemLoggingGroup(enable=True), - prediction_latency=SystemLoggingGroup(enable=True), - ), - ), - ], - loggers={"logger_1": {"path": logger_identifier}}, - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - client.post("/predict_text_classification", json=dict(sequences="asdf")) - client.post( - "/predict_text_classification", json=dict(question="asdf", context="asdf") - ) - calls = server_logger.logger.loggers[0].logger.loggers[0].calls - - c = Counter([call.split(",")[0] 
for call in calls]) - - assert c == SAMPLE_LOGS_DICT +# def test_data_logging_from_predefined(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name="text_classification", +# model=stub, +# add_predefined=[MetricFunctionConfig(func="text_classification")], +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# client.post( +# "/predict", +# json={ +# "sequences": [["Fun for adults and children.", "Fun for only children."]] +# }, +# ) +# calls = fetch_leaf_logger(server_logger).calls +# data_logging_logs = [call for call in calls if "DATA" in call] +# with open( +# "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 +# "r", +# ) as f: +# expected_logs = f.read().splitlines() +# for log, expected_log in zip(data_logging_logs, expected_logs): +# assert log == expected_log +# +# +# @flaky(max_runs=4, min_passes=3) +# def test_logging_only_system_info(): +# server_config = ServerConfig( +# endpoints=[EndpointConfig(task=task, name=name, model=stub)], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# _test_logger_contents( +# fetch_leaf_logger(server_logger), +# {"prediction_latency": 8}, +# ) +# +# +# def test_regex_target_logging(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name=name, +# data_logging={ +# "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] +# }, +# model=stub, +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# _test_logger_contents( +# fetch_leaf_logger(server_logger), +# {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, +# ) +# +# +# def test_multiple_targets_logging(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name=name, +# data_logging={ +# "pipeline_inputs.sequences": [ +# MetricFunctionConfig(func="identity") +# ], +# "engine_inputs": [MetricFunctionConfig(func="identity")], +# }, +# model=stub, +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# _test_logger_contents( +# fetch_leaf_logger(server_logger), +# { +# "pipeline_inputs.sequences__identity": 2, +# "engine_inputs__identity": 2, +# "prediction_latency": 8, +# }, +# ) 
+# +# +# @flaky(max_runs=3, min_passes=2) +# def test_function_metric_with_target_loggers(): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, +# name=name, +# data_logging={ +# "pipeline_inputs.sequences[0]": [ +# MetricFunctionConfig( +# func="identity", target_loggers=["logger_1"] +# ) +# ], +# "engine_inputs": [MetricFunctionConfig(func="identity")], +# }, +# model=stub, +# ) +# ], +# loggers={ +# "logger_1": {"path": logger_identifier}, +# "logger_2": {"path": logger_identifier}, +# }, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# +# for _ in range(2): +# client.post("/predict", json={"sequences": "today is great"}) +# +# _test_logger_contents( +# server_logger.logger.loggers[1].logger.loggers[0], +# { +# "pipeline_inputs.sequences__identity": 2, +# "engine_inputs__identity": 2, +# "prediction_latency": 8, +# }, +# ) +# _test_logger_contents( +# server_logger.logger.loggers[1].logger.loggers[1], +# { +# "pipeline_inputs.sequences__identity": 0, +# "engine_inputs__identity": 2, +# "prediction_latency": 8, +# }, +# ) +# +# +# @mock_engine(rng_seed=0) +# def test_instantiate_prometheus(mock_engine, tmp_path): +# client = TestClient( +# _build_app( +# ServerConfig( +# endpoints=[EndpointConfig(task="text_classification", model="default")], +# loggers=dict( +# prometheus={ +# "port": find_free_port(), +# "text_log_save_dir": tmp_path.name, +# "text_log_save_frequency": 30, +# } +# ), +# ) +# ) +# ) +# r = client.post("/predict", json=dict(sequences="asdf")) +# assert r.status_code == 200 +# shutil.rmtree(tmp_path.name, ignore_errors=True) +# +# +# @mock_engine(rng_seed=0) +# def test_endpoint_system_logging(mock_engine): +# server_config = ServerConfig( +# system_logging=ServerSystemLoggingConfig( +# request_details=SystemLoggingGroup(enable=True), +# resource_utilization=SystemLoggingGroup(enable=True), +# ), +# endpoints=[ +# EndpointConfig( +# task="text_classification", +# model="default", +# route="/predict_text_classification", +# logging_config=PipelineSystemLoggingConfig( +# inference_details=SystemLoggingGroup(enable=True), +# prediction_latency=SystemLoggingGroup(enable=True), +# ), +# ), +# EndpointConfig( +# task="question_answering", +# model="default", +# route="/predict_question_answering", +# logging_config=PipelineSystemLoggingConfig( +# inference_details=SystemLoggingGroup(enable=True), +# prediction_latency=SystemLoggingGroup(enable=True), +# ), +# ), +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine: +# app = _build_app(server_config) +# client = TestClient(app) +# client.post("/predict_text_classification", json=dict(sequences="asdf")) +# client.post( +# "/predict_text_classification", json=dict(question="asdf", context="asdf") +# ) +# calls = server_logger.logger.loggers[0].logger.loggers[0].calls +# +# c = Counter([call.split(",")[0] for call in calls]) +# +# assert c == SAMPLE_LOGS_DICT diff --git a/tests/server/test_system_logging.py b/tests/server/test_system_logging.py index b6a3fbd2b6..bd0a8a3ae3 100644 --- a/tests/server/test_system_logging.py +++ b/tests/server/test_system_logging.py @@ -1,169 +1,169 @@ -# Copyright (c) 
2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest import mock - -import pytest -from deepsparse.loggers.config import SystemLoggingGroup -from deepsparse.server.config import ( - EndpointConfig, - ServerConfig, - ServerSystemLoggingConfig, -) -from deepsparse.server.helpers import server_logger_from_config -from deepsparse.server.server import _build_app -from deepsparse.server.system_logging import log_resource_utilization -from fastapi.testclient import TestClient -from tests.deepsparse.loggers.helpers import ListLogger -from tests.utils import mock_engine - - -logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" -stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 -task = "text-classification" -name = "endpoint_name" - - -def _test_successful_requests(calls, successful_request): - relevant_call = [ - call - for call in calls - if call.startswith("identifier:request_details/successful_request_count") - ] - assert len(relevant_call) == 1 - relevant_call = relevant_call[0] - value = bool(int(relevant_call.split("value:")[1].split(",")[0])) - assert value == successful_request - - -def _test_response_msg(calls, response_msg): - relevant_call = [ - call - for call in calls - if call.startswith("identifier:request_details/response_message") - ] - assert len(relevant_call) == 1 - relevant_call = relevant_call[0] - value = relevant_call.split("value:")[1].split(",")[0] - assert value == response_msg - - -@pytest.mark.parametrize( - "json_payload, input_batch_size, successful_request, response_msg", - [ - ({"sequences": "today is great"}, 1, True, "Response status code: 200"), - ( - {"sequences": ["today is great", "today is great"]}, - 2, - True, - "Response status code: 200", - ), - ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), - ], -) -def test_log_request_details( - json_payload, input_batch_size, successful_request, response_msg -): - server_config = ServerConfig( - endpoints=[ - EndpointConfig( - task=task, name=name, model=stub, batch_size=input_batch_size - ) - ], - loggers={"logger_1": {"path": logger_identifier}}, - system_logging=ServerSystemLoggingConfig( - request_details=SystemLoggingGroup(enable=True) - ), - ) - server_logger = server_logger_from_config(server_config) - with mock.patch( - "deepsparse.server.server.server_logger_from_config", return_value=server_logger - ), mock_engine(rng_seed=0): - app = _build_app(server_config) - client = TestClient(app) - client.post("/predict", json=json_payload) - - calls = server_logger.logger.loggers[0].logger.loggers[0].calls - - _test_successful_requests(calls, successful_request) - _test_response_msg(calls, response_msg) - - -def _test_cpu_utilization(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith("identifier:resource_utilization/cpu_utilization_percent") - ] - assert len(relevant_calls) == num_iterations - - -def 
_test_memory_utilization(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith("identifier:resource_utilization/memory_utilization_percent") - ] - values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] - assert len(relevant_calls) == num_iterations - # memory utilization is a percentage, so it should be between 0 and 100 - assert all(0.0 < value < 100.0 for value in values) - - -def _test_total_memory_available(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith( - "identifier:resource_utilization/total_memory_available_bytes" - ) - ] - values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] - assert len(relevant_calls) == num_iterations - # assert all values are the same (total memory available is constant) - assert all(value == values[0] for value in values) - - -def _test_additional_items_to_log(calls, num_iterations): - relevant_calls = [ - call - for call in calls - if call.startswith("identifier:resource_utilization/test") - ] - values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] - assert len(relevant_calls) == num_iterations - # assert all values are the same ({"test" : 1} is constant) - assert all(value == 1 for value in values) - - -@pytest.mark.parametrize( - "num_iterations, additional_items_to_log", - [ - (5, {}), - (3, {"test": 1}), - ], -) -def test_log_resource_utilization(num_iterations, additional_items_to_log): - server_logger = ListLogger() - - for iter in range(num_iterations): - log_resource_utilization( - server_logger, prefix="resource_utilization", **additional_items_to_log - ) - - calls = server_logger.calls - - _test_cpu_utilization(calls, num_iterations) - _test_memory_utilization(calls, num_iterations) - _test_total_memory_available(calls, num_iterations) +# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, +# # software distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
+# +# from unittest import mock +# +# import pytest +# from deepsparse.loggers.config import SystemLoggingGroup +# from deepsparse.server.config import ( +# EndpointConfig, +# ServerConfig, +# ServerSystemLoggingConfig, +# ) +# from deepsparse.server.helpers import server_logger_from_config +# from deepsparse.server.server import _build_app +# from deepsparse.server.system_logging import log_resource_utilization +# from fastapi.testclient import TestClient +# from tests.deepsparse.loggers.helpers import ListLogger +# from tests.utils import mock_engine +# +# +# logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" +# stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 +# task = "text-classification" +# name = "endpoint_name" +# +# +# def _test_successful_requests(calls, successful_request): +# relevant_call = [ +# call +# for call in calls +# if call.startswith("identifier:request_details/successful_request_count") +# ] +# assert len(relevant_call) == 1 +# relevant_call = relevant_call[0] +# value = bool(int(relevant_call.split("value:")[1].split(",")[0])) +# assert value == successful_request +# +# +# def _test_response_msg(calls, response_msg): +# relevant_call = [ +# call +# for call in calls +# if call.startswith("identifier:request_details/response_message") +# ] +# assert len(relevant_call) == 1 +# relevant_call = relevant_call[0] +# value = relevant_call.split("value:")[1].split(",")[0] +# assert value == response_msg +# +# +# @pytest.mark.parametrize( +# "json_payload, input_batch_size, successful_request, response_msg", +# [ +# ({"sequences": "today is great"}, 1, True, "Response status code: 200"), +# ( +# {"sequences": ["today is great", "today is great"]}, +# 2, +# True, +# "Response status code: 200", +# ), +# ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), +# ], +# ) +# def test_log_request_details( +# json_payload, input_batch_size, successful_request, response_msg +# ): +# server_config = ServerConfig( +# endpoints=[ +# EndpointConfig( +# task=task, name=name, model=stub, batch_size=input_batch_size +# ) +# ], +# loggers={"logger_1": {"path": logger_identifier}}, +# system_logging=ServerSystemLoggingConfig( +# request_details=SystemLoggingGroup(enable=True) +# ), +# ) +# server_logger = server_logger_from_config(server_config) +# with mock.patch( +# "deepsparse.server.server.server_logger_from_config", return_value=server_logger +# ), mock_engine(rng_seed=0): +# app = _build_app(server_config) +# client = TestClient(app) +# client.post("/predict", json=json_payload) +# +# calls = server_logger.logger.loggers[0].logger.loggers[0].calls +# +# _test_successful_requests(calls, successful_request) +# _test_response_msg(calls, response_msg) +# +# +# def _test_cpu_utilization(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith("identifier:resource_utilization/cpu_utilization_percent") +# ] +# assert len(relevant_calls) == num_iterations +# +# +# def _test_memory_utilization(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith("identifier:resource_utilization/memory_utilization_percent") +# ] +# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] +# assert len(relevant_calls) == num_iterations +# # memory utilization is a percentage, so it should be between 0 and 100 +# assert all(0.0 < value < 100.0 for value in values) +# +# +# def 
_test_total_memory_available(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith( +# "identifier:resource_utilization/total_memory_available_bytes" +# ) +# ] +# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] +# assert len(relevant_calls) == num_iterations +# # assert all values are the same (total memory available is constant) +# assert all(value == values[0] for value in values) +# +# +# def _test_additional_items_to_log(calls, num_iterations): +# relevant_calls = [ +# call +# for call in calls +# if call.startswith("identifier:resource_utilization/test") +# ] +# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] +# assert len(relevant_calls) == num_iterations +# # assert all values are the same ({"test" : 1} is constant) +# assert all(value == 1 for value in values) +# +# +# @pytest.mark.parametrize( +# "num_iterations, additional_items_to_log", +# [ +# (5, {}), +# (3, {"test": 1}), +# ], +# ) +# def test_log_resource_utilization(num_iterations, additional_items_to_log): +# server_logger = ListLogger() +# +# for iter in range(num_iterations): +# log_resource_utilization( +# server_logger, prefix="resource_utilization", **additional_items_to_log +# ) +# +# calls = server_logger.calls +# +# _test_cpu_utilization(calls, num_iterations) +# _test_memory_utilization(calls, num_iterations) +# _test_total_memory_available(calls, num_iterations) From c117788fb9f256585a31c4d11741d630cfe136bc Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 7 Jun 2023 16:14:46 +0000 Subject: [PATCH 43/68] ready to review --- src/deepsparse/transformers/helpers.py | 44 +- src/deepsparse/utils/onnx.py | 30 +- src/deepsparse/yolo/utils/utils.py | 7 + tests/server/test_app.py | 332 +++++++-------- tests/server/test_config.py | 444 ++++++++++---------- tests/server/test_endpoints.py | 536 ++++++++++++------------- tests/server/test_loggers.py | 486 +++++++++++----------- tests/server/test_system_logging.py | 338 ++++++++-------- 8 files changed, 1110 insertions(+), 1107 deletions(-) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 847a7a9924..83b519baa5 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -135,7 +135,6 @@ def overwrite_transformer_onnx_model_inputs( path: str, batch_size: int = 1, max_length: int = 128, - output_path: Optional[str] = None, inplace: bool = True, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ @@ -146,25 +145,16 @@ def overwrite_transformer_onnx_model_inputs( :param path: path to the ONNX model to override :param batch_size: batch size to set :param max_length: max sequence length to set - :param output_path: if provided, the model will be saved to the given path, - otherwise, the model will be saved to a named temporary file that will - be deleted after the program exits - :param inplace: if True, the model will be modified in place, otherwise - a copy of the model will be saved to a temporary file - :return: if no output path, a tuple of the saved path to the model, list of - model input names, and reference to the tempfile object will be returned - otherwise, only the model input names will be returned + :param inplace: if True, the model will be modified in place (its inputs will + be overwritten). 
Else, a copy of that model, with overwritten inputs, + will be saved to a temporary file + :return: tuple of (path to the overwritten model, list of input names that were + overwritten, and a temporary file containing the overwritten model if + `inplace=False`, else None) """ - - if inplace and output_path is not None: - raise ValueError( - "Cannot specify both inplace=True and output_path. If inplace=True, " - "the model will be modified in place (the returned path will be identical" - "to the input path specified in argument `path`)" - ) - if inplace: - output_path = path # overwrite input shapes + # if > 2Gb model is to be modified in-place, operate + # exclusively on the model graph model = onnx.load(path, load_external_data=not inplace) initializer_input_names = set([node.name for node in model.graph.initializer]) external_inputs = [ @@ -177,14 +167,20 @@ def overwrite_transformer_onnx_model_inputs( input_names.append(external_input.name) # Save modified model - if not inplace: - tmp_file = NamedTemporaryFile() # file will be deleted after program exit + if inplace: + _LOGGER.info( + f"Overwriting in-place the input shapes of the transformer model at {path}" + ) + save_onnx(model, path) + return path, input_names, None + else: + tmp_file = NamedTemporaryFile() + _LOGGER.info( + f"Saving a copy of the transformer model: {path} " + f"with overwritten input shapes to {tmp_file.name}" + ) save_onnx(model, tmp_file.name) return tmp_file.name, input_names, tmp_file - else: - save_onnx(model, output_path) - - return output_path, input_names, None def _get_file_parent(file_path: str) -> str: diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 00f5f24233..eb31179bc9 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -24,7 +24,7 @@ from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE from deepsparse.utils.extractor import Extractor -from sparsezoo.utils import onnx_includes_external_data, save_onnx, validate_onnx +from sparsezoo.utils import save_onnx, validate_onnx try: @@ -60,21 +60,15 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ - - if not onnx_includes_external_data(model) and with_external_data: - raise ValueError( - "Model does not include external data, it only includes the model graph." - "Cannot save its external data to separate a file." - "Set argument `with_external_data`=False" - ) shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) - _LOGGER.warning(f"Saving model to temporary directory: {tempfile.tempdir}") + _LOGGER.info(f"Saving model to temporary directory: {tempfile.tempdir}") if with_external_data: external_data = os.path.join( tempfile.tempdir, next(tempfile._get_candidate_names()) ) has_external_data = save_onnx(model, shaped_model.name, external_data) + _LOGGER.info(f"Saving external data to temporary directory: {external_data}") else: has_external_data = save_onnx(model, shaped_model.name) try: @@ -218,7 +212,7 @@ def override_onnx_batch_size( external data are saved along the model graph. :param batch_size: Override for the batch size dimension :param inplace: If True, overwrite the original model file. - Else save the modified model to a temporary file. + Else, save the modified model to a temporary file. :return: File path to modified ONNX model. 
If inplace is True, the modified model will be saved to the same path as the original @@ -234,12 +228,13 @@ def override_onnx_batch_size( for external_input in external_inputs: external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # Save modified model, this will be cleaned up when context is exited if inplace: + _LOGGER.info( + f"Overwriting in-place the batch size of the model at {onnx_filepath}" + ) save_onnx(model, onnx_filepath) return onnx_filepath else: - # Save modified model, this will be cleaned up when context is exited return save_onnx_to_temp_files(model, with_external_data=not inplace) @@ -302,12 +297,17 @@ def override_onnx_input_shapes( for dim_idx, dim in enumerate(external_input.type.tensor_type.shape.dim): dim.dim_value = input_shapes[input_idx][dim_idx] - # Save modified model, this will be cleaned up when context is exited if inplace: + _LOGGER.info( + "Overwriting in-place the input shapes of the model " f"at {onnx_filepath}" + ) onnx.save(model, onnx_filepath) return onnx_filepath else: - # Save modified model, this will be cleaned up when context is exited + _LOGGER.info( + f"Saving the input shapes of the model at {onnx_filepath} " + f"to a temporary file" + ) return save_onnx_to_temp_files(model, with_external_data=not inplace) @@ -387,7 +387,7 @@ def truncate_onnx_model( output.type.tensor_type.shape.Clear() # save and check model - _LOGGER.info("Saving truncated model to %s", output_filepath) + _LOGGER.debug(f"Saving truncated model to {output_filepath}") save_onnx(extracted_model, output_filepath, "external_data") validate_onnx(output_filepath) diff --git a/src/deepsparse/yolo/utils/utils.py b/src/deepsparse/yolo/utils/utils.py index 3a0f596fe1..e778fabe17 100644 --- a/src/deepsparse/yolo/utils/utils.py +++ b/src/deepsparse/yolo/utils/utils.py @@ -405,9 +405,16 @@ def modify_yolo_onnx_input_shape( set_tensor_dim_shape(model.graph.output[0], 1, num_predictions) if inplace: + _LOGGER.info( + "Overwriting in-place the ONNX model " + f"at {model_path} with the new input shape" + ) save_onnx(model, model_path) return model_path else: + _LOGGER.info( + "Saving the ONNX model with the " "new input shape to a temporary file" + ) return save_onnx_to_temp_files(model, with_external_data=not inplace) diff --git a/tests/server/test_app.py b/tests/server/test_app.py index 678152adc9..9bc71e1a36 100644 --- a/tests/server/test_app.py +++ b/tests/server/test_app.py @@ -1,166 +1,166 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# -# import os -# from copy import deepcopy -# from re import escape -# from unittest.mock import patch -# -# import pytest -# from deepsparse.server.config import EndpointConfig, ServerConfig -# from deepsparse.server.server import _build_app -# -# -# def test_add_multiple_endpoints_with_no_route(): -# with pytest.raises( -# ValueError, -# match=( -# "must specify `route` for all endpoints if multiple endpoints are used." 
-# ), -# ): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# endpoints=[ -# EndpointConfig(task="", model="", route=None), -# EndpointConfig(task="", model="", route=None), -# ], -# loggers={}, -# ) -# ) -# -# -# def test_add_multiple_endpoints_with_same_route(): -# with pytest.raises(ValueError, match="asdf specified 2 times"): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# endpoints=[ -# EndpointConfig(task="", model="", route="asdf"), -# EndpointConfig(task="", model="", route="asdf"), -# ], -# loggers={}, -# ) -# ) -# -# -# def test_invalid_integration(): -# with pytest.raises( -# ValueError, -# match=escape( -# "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" -# ), -# ): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# integration="asdf", -# endpoints=[], -# loggers={}, -# ) -# ) -# -# -# def test_pytorch_num_threads(): -# torch = pytest.importorskip("torch") -# -# orig_num_threads = torch.get_num_threads() -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# pytorch_num_threads=None, -# endpoints=[], -# loggers={}, -# ) -# ) -# assert torch.get_num_threads() == orig_num_threads -# -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# pytorch_num_threads=1, -# endpoints=[], -# loggers={}, -# ) -# ) -# assert torch.get_num_threads() == 1 -# -# -# @patch.dict(os.environ, deepcopy(os.environ)) -# def test_thread_pinning_none(): -# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) -# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="none", -# endpoints=[], -# loggers={}, -# ) -# ) -# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" -# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" -# -# -# @patch.dict(os.environ, deepcopy(os.environ)) -# def test_thread_pinning_numa(): -# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) -# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="numa", -# endpoints=[], -# loggers={}, -# ) -# ) -# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" -# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" -# -# -# @patch.dict(os.environ, deepcopy(os.environ)) -# def test_thread_pinning_cores(): -# os.environ.pop("NM_BIND_THREADS_TO_CORES", None) -# os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="core", -# endpoints=[], -# loggers={}, -# ) -# ) -# assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" -# assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" -# -# -# def test_invalid_thread_pinning(): -# with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): -# _build_app( -# ServerConfig( -# num_cores=1, -# num_workers=1, -# engine_thread_pinning="asdf", -# endpoints=[], -# loggers={}, -# ) -# ) +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from copy import deepcopy +from re import escape +from unittest.mock import patch + +import pytest +from deepsparse.server.config import EndpointConfig, ServerConfig +from deepsparse.server.server import _build_app + + +def test_add_multiple_endpoints_with_no_route(): + with pytest.raises( + ValueError, + match=( + "must specify `route` for all endpoints if multiple endpoints are used." + ), + ): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + endpoints=[ + EndpointConfig(task="", model="", route=None), + EndpointConfig(task="", model="", route=None), + ], + loggers={}, + ) + ) + + +def test_add_multiple_endpoints_with_same_route(): + with pytest.raises(ValueError, match="asdf specified 2 times"): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + endpoints=[ + EndpointConfig(task="", model="", route="asdf"), + EndpointConfig(task="", model="", route="asdf"), + ], + loggers={}, + ) + ) + + +def test_invalid_integration(): + with pytest.raises( + ValueError, + match=escape( + "Unknown integration field asdf. Expected one of ['local', 'sagemaker']" + ), + ): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + integration="asdf", + endpoints=[], + loggers={}, + ) + ) + + +def test_pytorch_num_threads(): + torch = pytest.importorskip("torch") + + orig_num_threads = torch.get_num_threads() + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + pytorch_num_threads=None, + endpoints=[], + loggers={}, + ) + ) + assert torch.get_num_threads() == orig_num_threads + + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + pytorch_num_threads=1, + endpoints=[], + loggers={}, + ) + ) + assert torch.get_num_threads() == 1 + + +@patch.dict(os.environ, deepcopy(os.environ)) +def test_thread_pinning_none(): + os.environ.pop("NM_BIND_THREADS_TO_CORES", None) + os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="none", + endpoints=[], + loggers={}, + ) + ) + assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" + assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" + + +@patch.dict(os.environ, deepcopy(os.environ)) +def test_thread_pinning_numa(): + os.environ.pop("NM_BIND_THREADS_TO_CORES", None) + os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="numa", + endpoints=[], + loggers={}, + ) + ) + assert os.environ["NM_BIND_THREADS_TO_CORES"] == "0" + assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "1" + + +@patch.dict(os.environ, deepcopy(os.environ)) +def test_thread_pinning_cores(): + os.environ.pop("NM_BIND_THREADS_TO_CORES", None) + os.environ.pop("NM_BIND_THREADS_TO_SOCKETS", None) + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="core", + endpoints=[], + loggers={}, + ) + ) + assert os.environ["NM_BIND_THREADS_TO_CORES"] == "1" + assert os.environ["NM_BIND_THREADS_TO_SOCKETS"] == "0" + + +def test_invalid_thread_pinning(): + with pytest.raises(ValueError, match='Expected one of {"core","numa","none"}.'): + _build_app( + ServerConfig( + num_cores=1, + num_workers=1, + engine_thread_pinning="asdf", + endpoints=[], + loggers={}, + ) + ) diff --git a/tests/server/test_config.py b/tests/server/test_config.py index f2f9b0e6fe..b1c1c75a84 100644 --- a/tests/server/test_config.py +++ b/tests/server/test_config.py @@ -1,222 
+1,222 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# -# import yaml -# -# import pytest -# from deepsparse.server.config import ( -# EndpointConfig, -# ImageSizesConfig, -# MetricFunctionConfig, -# SequenceLengthsConfig, -# ServerConfig, -# ) -# -# -# def test_no_bucketing_config(): -# cfg = EndpointConfig(task="", model="").to_pipeline_config() -# assert cfg.input_shapes is None -# assert cfg.kwargs == {} -# -# -# @pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) -# def test_bucketing_sequence_length_for_cv(task): -# with pytest.raises(ValueError, match=f"for non-nlp task {task}"): -# EndpointConfig( -# task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) -# ).to_pipeline_config() -# -# -# @pytest.mark.parametrize( -# "task", ["question_answering", "text_classification", "token_classification"] -# ) -# def test_bucketing_image_size_for_nlp(task): -# with pytest.raises(ValueError, match=f"for non computer vision task {task}"): -# EndpointConfig( -# task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) -# ).to_pipeline_config() -# -# -# def test_bucketing_zero_sequence_length(): -# with pytest.raises(ValueError, match="at least one sequence length"): -# EndpointConfig( -# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) -# ).to_pipeline_config() -# -# -# def test_bucketing_zero_image_size(): -# with pytest.raises(ValueError, match="at least one image size"): -# EndpointConfig( -# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) -# ).to_pipeline_config() -# -# -# def test_bucketing_one_sequence_length(): -# cfg = EndpointConfig( -# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) -# ).to_pipeline_config() -# assert cfg.input_shapes is None -# assert cfg.kwargs == {"sequence_length": 32} -# -# -# def test_bucketing_multi_sequence_length(): -# cfg = EndpointConfig( -# task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) -# ).to_pipeline_config() -# assert cfg.input_shapes is None -# assert cfg.kwargs == {"sequence_length": [32, 64]} -# -# -# def test_bucketing_one_image_size(): -# cfg = EndpointConfig( -# task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) -# ).to_pipeline_config() -# assert cfg.input_shapes == [[256, 256]] -# assert cfg.kwargs == {} -# -# -# def test_endpoint_config_to_pipeline_copy_fields(): -# cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() -# assert cfg.task == "qa" -# assert cfg.model_path == "zxcv" -# -# cfg = EndpointConfig(task="", model="").to_pipeline_config() -# assert cfg.batch_size == 1 -# -# cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() -# assert cfg.batch_size == 64 -# -# -# def test_yaml_load_config(tmp_path): -# server_config = ServerConfig( -# num_cores=1, -# num_workers=2, -# integration="sagemaker", -# endpoints=[ -# EndpointConfig( 
-# name="asdf", -# route="qwer", -# task="uiop", -# model="hjkl", -# batch_size=1, -# bucketing=None, -# ), -# EndpointConfig( -# name="asdfd", -# route="qwer", -# task="uiop", -# model="hjkl", -# batch_size=2, -# bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), -# ), -# EndpointConfig( -# name="asdfde", -# route="qwer", -# task="uiop", -# model="hjkl", -# batch_size=3, -# bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), -# ), -# ], -# loggers={}, -# ) -# -# path = tmp_path / "config.yaml" -# with open(path, "w") as fp: -# yaml.dump(server_config.dict(), fp) -# -# with open(path) as fp: -# obj = yaml.load(fp, Loader=yaml.Loader) -# server_config2 = ServerConfig(**obj) -# assert server_config == server_config2 -# -# -# metric_function_config_yaml_1 = """ -# func: identity -# frequency: 5 -# loggers: -# - python""" -# -# metric_function_config_yaml_2 = """ -# func: numpy.max""" -# -# metric_function_config_yaml_3 = """ -# func: numpy.max -# frequency: 0""" -# -# -# @pytest.mark.parametrize( -# "config_yaml, should_fail, instance_type", -# [ -# (metric_function_config_yaml_1, False, MetricFunctionConfig), -# (metric_function_config_yaml_2, False, MetricFunctionConfig), -# ( -# metric_function_config_yaml_3, -# True, -# MetricFunctionConfig, -# ), # frequency cannot be zero -# ], -# ) -# def test_function_logging_config(config_yaml, should_fail, instance_type): -# obj = yaml.safe_load(config_yaml) -# if should_fail: -# with pytest.raises(Exception): -# MetricFunctionConfig(**obj) -# else: -# assert MetricFunctionConfig(**obj) -# -# -# def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): -# return ServerConfig( -# endpoints=[ -# EndpointConfig( -# name=endpoint_1_name, -# task=task_name, -# model="hjkl", -# ), -# EndpointConfig( -# name=endpoint_2_name, -# task=task_name, -# model="hjkl", -# ), -# ] -# ) -# -# -# @pytest.mark.parametrize( -# "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 -# [ -# ("some_task", None, None, False, "some_task-0", "some_task-1"), -# ("some_task", "name_1", None, False, "name_1", "some_task-0"), -# ("some_task", "name_1", "name_2", False, "name_1", "name_2"), -# ("some_task", "name_1", "name_1", True, None, None), -# ], -# ) -# def test_unique_endpoint_names( -# task_name, -# endpoint_1_name, -# endpoint_2_name, -# raise_error, -# expected_endpoint_1_name, -# expected_endpoint_2_name, -# ): -# if raise_error: -# with pytest.raises(ValueError): -# _create_server_config(task_name, endpoint_1_name, endpoint_2_name) -# return -# return -# -# server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) -# assert server_config.endpoints[0].name == expected_endpoint_1_name -# assert server_config.endpoints[1].name == expected_endpoint_2_name +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
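+"""
+Tests for the server's pydantic config models: bucketing validation for CV and
+NLP tasks, EndpointConfig-to-PipelineConfig field copying, YAML round-tripping
+of ServerConfig, MetricFunctionConfig validation, and unique endpoint naming.
+"""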
+ +import yaml + +import pytest +from deepsparse.server.config import ( + EndpointConfig, + ImageSizesConfig, + MetricFunctionConfig, + SequenceLengthsConfig, + ServerConfig, +) + + +def test_no_bucketing_config(): + cfg = EndpointConfig(task="", model="").to_pipeline_config() + assert cfg.input_shapes is None + assert cfg.kwargs == {} + + +@pytest.mark.parametrize("task", ["yolo", "yolact", "image_classification"]) +def test_bucketing_sequence_length_for_cv(task): + with pytest.raises(ValueError, match=f"for non-nlp task {task}"): + EndpointConfig( + task=task, model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) + ).to_pipeline_config() + + +@pytest.mark.parametrize( + "task", ["question_answering", "text_classification", "token_classification"] +) +def test_bucketing_image_size_for_nlp(task): + with pytest.raises(ValueError, match=f"for non computer vision task {task}"): + EndpointConfig( + task=task, model="", bucketing=ImageSizesConfig(image_sizes=[]) + ).to_pipeline_config() + + +def test_bucketing_zero_sequence_length(): + with pytest.raises(ValueError, match="at least one sequence length"): + EndpointConfig( + task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[]) + ).to_pipeline_config() + + +def test_bucketing_zero_image_size(): + with pytest.raises(ValueError, match="at least one image size"): + EndpointConfig( + task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[]) + ).to_pipeline_config() + + +def test_bucketing_one_sequence_length(): + cfg = EndpointConfig( + task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32]) + ).to_pipeline_config() + assert cfg.input_shapes is None + assert cfg.kwargs == {"sequence_length": 32} + + +def test_bucketing_multi_sequence_length(): + cfg = EndpointConfig( + task="qa", model="", bucketing=SequenceLengthsConfig(sequence_lengths=[32, 64]) + ).to_pipeline_config() + assert cfg.input_shapes is None + assert cfg.kwargs == {"sequence_length": [32, 64]} + + +def test_bucketing_one_image_size(): + cfg = EndpointConfig( + task="yolo", model="", bucketing=ImageSizesConfig(image_sizes=[(256, 256)]) + ).to_pipeline_config() + assert cfg.input_shapes == [[256, 256]] + assert cfg.kwargs == {} + + +def test_endpoint_config_to_pipeline_copy_fields(): + cfg = EndpointConfig(task="qa", model="zxcv").to_pipeline_config() + assert cfg.task == "qa" + assert cfg.model_path == "zxcv" + + cfg = EndpointConfig(task="", model="").to_pipeline_config() + assert cfg.batch_size == 1 + + cfg = EndpointConfig(task="", model="", batch_size=64).to_pipeline_config() + assert cfg.batch_size == 64 + + +def test_yaml_load_config(tmp_path): + server_config = ServerConfig( + num_cores=1, + num_workers=2, + integration="sagemaker", + endpoints=[ + EndpointConfig( + name="asdf", + route="qwer", + task="uiop", + model="hjkl", + batch_size=1, + bucketing=None, + ), + EndpointConfig( + name="asdfd", + route="qwer", + task="uiop", + model="hjkl", + batch_size=2, + bucketing=ImageSizesConfig(image_sizes=[(1, 1), (2, 2)]), + ), + EndpointConfig( + name="asdfde", + route="qwer", + task="uiop", + model="hjkl", + batch_size=3, + bucketing=SequenceLengthsConfig(sequence_lengths=[5, 6, 7]), + ), + ], + loggers={}, + ) + + path = tmp_path / "config.yaml" + with open(path, "w") as fp: + yaml.dump(server_config.dict(), fp) + + with open(path) as fp: + obj = yaml.load(fp, Loader=yaml.Loader) + server_config2 = ServerConfig(**obj) + assert server_config == server_config2 + + +metric_function_config_yaml_1 = """ + func: identity + 
frequency: 5 + loggers: + - python""" + +metric_function_config_yaml_2 = """ + func: numpy.max""" + +metric_function_config_yaml_3 = """ + func: numpy.max + frequency: 0""" + + +@pytest.mark.parametrize( + "config_yaml, should_fail, instance_type", + [ + (metric_function_config_yaml_1, False, MetricFunctionConfig), + (metric_function_config_yaml_2, False, MetricFunctionConfig), + ( + metric_function_config_yaml_3, + True, + MetricFunctionConfig, + ), # frequency cannot be zero + ], +) +def test_function_logging_config(config_yaml, should_fail, instance_type): + obj = yaml.safe_load(config_yaml) + if should_fail: + with pytest.raises(Exception): + MetricFunctionConfig(**obj) + else: + assert MetricFunctionConfig(**obj) + + +def _create_server_config(task_name, endpoint_1_name, endpoint_2_name): + return ServerConfig( + endpoints=[ + EndpointConfig( + name=endpoint_1_name, + task=task_name, + model="hjkl", + ), + EndpointConfig( + name=endpoint_2_name, + task=task_name, + model="hjkl", + ), + ] + ) + + +@pytest.mark.parametrize( + "task_name, endpoint_1_name, endpoint_2_name, raise_error, expected_endpoint_1_name, expected_endpoint_2_name", # noqa: E501 + [ + ("some_task", None, None, False, "some_task-0", "some_task-1"), + ("some_task", "name_1", None, False, "name_1", "some_task-0"), + ("some_task", "name_1", "name_2", False, "name_1", "name_2"), + ("some_task", "name_1", "name_1", True, None, None), + ], +) +def test_unique_endpoint_names( + task_name, + endpoint_1_name, + endpoint_2_name, + raise_error, + expected_endpoint_1_name, + expected_endpoint_2_name, +): + if raise_error: + with pytest.raises(ValueError): + _create_server_config(task_name, endpoint_1_name, endpoint_2_name) + return + return + + server_config = _create_server_config(task_name, endpoint_1_name, endpoint_2_name) + assert server_config.endpoints[0].name == expected_endpoint_1_name + assert server_config.endpoints[1].name == expected_endpoint_2_name diff --git a/tests/server/test_endpoints.py b/tests/server/test_endpoints.py index 411fb46446..f028b37e75 100644 --- a/tests/server/test_endpoints.py +++ b/tests/server/test_endpoints.py @@ -1,268 +1,268 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. -# -# from typing import List -# from unittest.mock import Mock -# -# from pydantic import BaseModel -# -# import pytest -# from deepsparse.loggers import MultiLogger -# from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig -# from deepsparse.server.server import _add_pipeline_endpoint, _build_app -# from fastapi import FastAPI, UploadFile -# from fastapi.testclient import TestClient -# from tests.utils import mock_engine -# -# -# class FromFilesSchema(BaseModel): -# def from_files(self, f): -# # do nothing - this method exists just to test files endpoint logic -# ... 
-# -# -# class StrSchema(BaseModel): -# value: str -# -# -# def parse(v: StrSchema) -> int: -# return int(v.value) -# -# -# class TestStatusEndpoints: -# @pytest.fixture(scope="class") -# def server_config(self): -# server_config = ServerConfig( -# num_cores=1, num_workers=1, endpoints=[], loggers={} -# ) -# yield server_config -# -# @pytest.fixture(scope="class") -# def client(self, server_config): -# yield TestClient(_build_app(server_config)) -# -# def test_config(self, server_config, client): -# response = client.get("/config") -# loaded = ServerConfig(**response.json()) -# assert loaded == server_config -# -# @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) -# def test_pings_exist(self, client, route): -# response = client.get(route) -# assert response.status_code == 200 -# assert response.json() is True -# -# def test_docs_exist(self, client): -# assert client.get("/docs").status_code == 200 -# -# def test_home_redirects_to_docs(self, client): -# response = client.get("/") -# assert response.status_code == 200 -# assert response.request.path_url == "/docs" -# assert len(response.history) > 0 -# assert response.history[-1].is_redirect -# -# -# class TestMockEndpoints: -# @pytest.fixture(scope="class") -# def server_config(self): -# server_config = ServerConfig( -# num_cores=1, num_workers=1, endpoints=[], loggers={} -# ) -# yield server_config -# -# @pytest.fixture(scope="class") -# def app(self, server_config): -# yield _build_app(server_config) -# -# @pytest.fixture(scope="class") -# def client(self, app): -# yield TestClient(app) -# -# def test_add_model_endpoint(self, app: FastAPI, client: TestClient): -# mock_pipeline = Mock( -# side_effect=parse, -# input_schema=StrSchema, -# output_schema=int, -# logger=MultiLogger([]), -# ) -# _add_pipeline_endpoint( -# app, -# system_logging_config=SystemLoggingConfig(), -# endpoint_config=Mock(route="/predict/parse_int"), -# pipeline=mock_pipeline, -# ) -# assert app.routes[-1].path == "/predict/parse_int" -# assert app.routes[-1].response_model is int -# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} -# assert app.routes[-1].methods == {"POST"} -# -# for v in ["1234", "5678"]: -# response = client.post("/predict/parse_int", json=dict(value=v)) -# assert response.status_code == 200 -# assert response.json() == int(v) -# -# def test_add_model_endpoint_with_from_files(self, app): -# _add_pipeline_endpoint( -# app, -# system_logging_config=Mock(), -# endpoint_config=Mock(route="/predict/parse_int"), -# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), -# ) -# assert app.routes[-2].path == "/predict/parse_int" -# assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} -# assert app.routes[-1].path == "/predict/parse_int/from_files" -# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} -# assert app.routes[-1].response_model is int -# assert app.routes[-1].methods == {"POST"} -# -# def test_sagemaker_only_adds_one_endpoint(self, app): -# num_routes = len(app.routes) -# _add_pipeline_endpoint( -# app, -# endpoint_config=Mock(route="/predict/parse_int"), -# system_logging_config=Mock(), -# pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), -# integration="sagemaker", -# ) -# assert len(app.routes) == num_routes + 1 -# assert app.routes[-1].path == "/invocations" -# assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} -# -# num_routes = len(app.routes) -# _add_pipeline_endpoint( -# app, 
-# endpoint_config=Mock(route="/predict/parse_int"), -# system_logging_config=Mock(), -# pipeline=Mock(input_schema=StrSchema, output_schema=int), -# integration="sagemaker", -# ) -# assert len(app.routes) == num_routes + 1 -# assert app.routes[-1].path == "/invocations" -# assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} -# -# def test_add_endpoint_with_no_route_specified(self, app): -# _add_pipeline_endpoint( -# app, -# endpoint_config=Mock(route=None), -# system_logging_config=Mock(), -# pipeline=Mock(input_schema=StrSchema, output_schema=int), -# ) -# assert app.routes[-1].path == "/predict" -# -# -# class TestActualModelEndpoints: -# @pytest.fixture(scope="class") -# def client(self): -# stub = ( -# "zoo:nlp/text_classification/distilbert-none/" -# "pytorch/huggingface/qqp/pruned80_quant-none-vnni" -# ) -# server_config = ServerConfig( -# num_cores=1, -# num_workers=1, -# endpoints=[ -# EndpointConfig( -# route="/predict/dynamic-batch", -# task="text-classification", -# model=stub, -# batch_size=1, -# ), -# EndpointConfig( -# route="/predict/static-batch", -# task="text-classification", -# model=stub, -# batch_size=2, -# ), -# ], -# loggers={}, # do not instantiate any loggers -# ) -# with mock_engine(rng_seed=0): -# app = _build_app(server_config) -# yield TestClient(app) -# -# def test_static_batch_errors_on_wrong_batch_size(self, client): -# with pytest.raises( -# RuntimeError, -# match=( -# "batch size of 1 passed into pipeline is " -# "not divisible by model batch size of 2" -# ), -# ): -# client.post("/predict/static-batch", json={"sequences": "today is great"}) -# -# def test_static_batch_good_request(self, client): -# response = client.post( -# "/predict/static-batch", -# json={"sequences": ["today is great", "today is terrible"]}, -# ) -# assert response.status_code == 200 -# output = response.json() -# assert len(output["labels"]) == 2 -# assert len(output["scores"]) == 2 -# -# @pytest.mark.parametrize( -# "seqs", -# [ -# ["today is great"], -# ["today is great", "today is terrible"], -# ["the first sentence", "the second sentence", "the third sentence"], -# ], -# ) -# def test_dynamic_batch_any(self, client, seqs): -# response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) -# assert response.status_code == 200 -# output = response.json() -# assert len(output["labels"]) == len(seqs) -# assert len(output["scores"]) == len(seqs) -# -# -# class TestDynamicEndpoints: -# @pytest.fixture(scope="class") -# def client(self): -# server_config = ServerConfig( -# num_cores=1, num_workers=1, endpoints=[], loggers=None -# ) -# with mock_engine(rng_seed=0): -# app = _build_app(server_config) -# yield TestClient(app) -# -# -# @mock_engine(rng_seed=0) -# def test_dynamic_add_and_remove_endpoint(engine_mock): -# server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) -# app = _build_app(server_config) -# client = TestClient(app) -# -# # assert /predict doesn't exist -# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code -# -# # add /predict -# response = client.post( -# "/endpoints", -# json=EndpointConfig(task="text-classification", model="default").dict(), -# ) -# assert response.status_code == 200 -# response = client.post("/predict", json=dict(sequences="asdf")) -# assert response.status_code == 200 -# -# # remove /predict -# response = client.delete( -# "/endpoints", -# json=EndpointConfig( -# route="/predict", task="text-classification", model="default" -# ).dict(), -# ) -# assert 
response.status_code == 200 -# assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List +from unittest.mock import Mock + +from pydantic import BaseModel + +import pytest +from deepsparse.loggers import MultiLogger +from deepsparse.server.config import EndpointConfig, ServerConfig, SystemLoggingConfig +from deepsparse.server.server import _add_pipeline_endpoint, _build_app +from fastapi import FastAPI, UploadFile +from fastapi.testclient import TestClient +from tests.utils import mock_engine + + +class FromFilesSchema(BaseModel): + def from_files(self, f): + # do nothing - this method exists just to test files endpoint logic + ... + + +class StrSchema(BaseModel): + value: str + + +def parse(v: StrSchema) -> int: + return int(v.value) + + +class TestStatusEndpoints: + @pytest.fixture(scope="class") + def server_config(self): + server_config = ServerConfig( + num_cores=1, num_workers=1, endpoints=[], loggers={} + ) + yield server_config + + @pytest.fixture(scope="class") + def client(self, server_config): + yield TestClient(_build_app(server_config)) + + def test_config(self, server_config, client): + response = client.get("/config") + loaded = ServerConfig(**response.json()) + assert loaded == server_config + + @pytest.mark.parametrize("route", ["/ping", "/health", "/healthcheck", "/status"]) + def test_pings_exist(self, client, route): + response = client.get(route) + assert response.status_code == 200 + assert response.json() is True + + def test_docs_exist(self, client): + assert client.get("/docs").status_code == 200 + + def test_home_redirects_to_docs(self, client): + response = client.get("/") + assert response.status_code == 200 + assert response.request.path_url == "/docs" + assert len(response.history) > 0 + assert response.history[-1].is_redirect + + +class TestMockEndpoints: + @pytest.fixture(scope="class") + def server_config(self): + server_config = ServerConfig( + num_cores=1, num_workers=1, endpoints=[], loggers={} + ) + yield server_config + + @pytest.fixture(scope="class") + def app(self, server_config): + yield _build_app(server_config) + + @pytest.fixture(scope="class") + def client(self, app): + yield TestClient(app) + + def test_add_model_endpoint(self, app: FastAPI, client: TestClient): + mock_pipeline = Mock( + side_effect=parse, + input_schema=StrSchema, + output_schema=int, + logger=MultiLogger([]), + ) + _add_pipeline_endpoint( + app, + system_logging_config=SystemLoggingConfig(), + endpoint_config=Mock(route="/predict/parse_int"), + pipeline=mock_pipeline, + ) + assert app.routes[-1].path == "/predict/parse_int" + assert app.routes[-1].response_model is int + assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} + assert app.routes[-1].methods == {"POST"} + + for v in ["1234", "5678"]: + response = client.post("/predict/parse_int", json=dict(value=v)) + assert response.status_code 
== 200 + assert response.json() == int(v) + + def test_add_model_endpoint_with_from_files(self, app): + _add_pipeline_endpoint( + app, + system_logging_config=Mock(), + endpoint_config=Mock(route="/predict/parse_int"), + pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), + ) + assert app.routes[-2].path == "/predict/parse_int" + assert app.routes[-2].endpoint.__annotations__ == {"request": FromFilesSchema} + assert app.routes[-1].path == "/predict/parse_int/from_files" + assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} + assert app.routes[-1].response_model is int + assert app.routes[-1].methods == {"POST"} + + def test_sagemaker_only_adds_one_endpoint(self, app): + num_routes = len(app.routes) + _add_pipeline_endpoint( + app, + endpoint_config=Mock(route="/predict/parse_int"), + system_logging_config=Mock(), + pipeline=Mock(input_schema=FromFilesSchema, output_schema=int), + integration="sagemaker", + ) + assert len(app.routes) == num_routes + 1 + assert app.routes[-1].path == "/invocations" + assert app.routes[-1].endpoint.__annotations__ == {"request": List[UploadFile]} + + num_routes = len(app.routes) + _add_pipeline_endpoint( + app, + endpoint_config=Mock(route="/predict/parse_int"), + system_logging_config=Mock(), + pipeline=Mock(input_schema=StrSchema, output_schema=int), + integration="sagemaker", + ) + assert len(app.routes) == num_routes + 1 + assert app.routes[-1].path == "/invocations" + assert app.routes[-1].endpoint.__annotations__ == {"request": StrSchema} + + def test_add_endpoint_with_no_route_specified(self, app): + _add_pipeline_endpoint( + app, + endpoint_config=Mock(route=None), + system_logging_config=Mock(), + pipeline=Mock(input_schema=StrSchema, output_schema=int), + ) + assert app.routes[-1].path == "/predict" + + +class TestActualModelEndpoints: + @pytest.fixture(scope="class") + def client(self): + stub = ( + "zoo:nlp/text_classification/distilbert-none/" + "pytorch/huggingface/qqp/pruned80_quant-none-vnni" + ) + server_config = ServerConfig( + num_cores=1, + num_workers=1, + endpoints=[ + EndpointConfig( + route="/predict/dynamic-batch", + task="text-classification", + model=stub, + batch_size=1, + ), + EndpointConfig( + route="/predict/static-batch", + task="text-classification", + model=stub, + batch_size=2, + ), + ], + loggers={}, # do not instantiate any loggers + ) + with mock_engine(rng_seed=0): + app = _build_app(server_config) + yield TestClient(app) + + def test_static_batch_errors_on_wrong_batch_size(self, client): + with pytest.raises( + RuntimeError, + match=( + "batch size of 1 passed into pipeline is " + "not divisible by model batch size of 2" + ), + ): + client.post("/predict/static-batch", json={"sequences": "today is great"}) + + def test_static_batch_good_request(self, client): + response = client.post( + "/predict/static-batch", + json={"sequences": ["today is great", "today is terrible"]}, + ) + assert response.status_code == 200 + output = response.json() + assert len(output["labels"]) == 2 + assert len(output["scores"]) == 2 + + @pytest.mark.parametrize( + "seqs", + [ + ["today is great"], + ["today is great", "today is terrible"], + ["the first sentence", "the second sentence", "the third sentence"], + ], + ) + def test_dynamic_batch_any(self, client, seqs): + response = client.post("/predict/dynamic-batch", json={"sequences": seqs}) + assert response.status_code == 200 + output = response.json() + assert len(output["labels"]) == len(seqs) + assert len(output["scores"]) == len(seqs) + + +class 
TestDynamicEndpoints: + @pytest.fixture(scope="class") + def client(self): + server_config = ServerConfig( + num_cores=1, num_workers=1, endpoints=[], loggers=None + ) + with mock_engine(rng_seed=0): + app = _build_app(server_config) + yield TestClient(app) + + +@mock_engine(rng_seed=0) +def test_dynamic_add_and_remove_endpoint(engine_mock): + server_config = ServerConfig(num_cores=1, num_workers=1, endpoints=[], loggers={}) + app = _build_app(server_config) + client = TestClient(app) + + # assert /predict doesn't exist + assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code + + # add /predict + response = client.post( + "/endpoints", + json=EndpointConfig(task="text-classification", model="default").dict(), + ) + assert response.status_code == 200 + response = client.post("/predict", json=dict(sequences="asdf")) + assert response.status_code == 200 + + # remove /predict + response = client.delete( + "/endpoints", + json=EndpointConfig( + route="/predict", task="text-classification", model="default" + ).dict(), + ) + assert response.status_code == 200 + assert 404 == client.post("/predict", json=dict(sequences="asdf")).status_code diff --git a/tests/server/test_loggers.py b/tests/server/test_loggers.py index 8802835381..ce2576c09f 100644 --- a/tests/server/test_loggers.py +++ b/tests/server/test_loggers.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os + import shutil from collections import Counter from unittest import mock @@ -58,247 +58,247 @@ def test_default_logger(): "deepsparse.server.server.server_logger_from_config", return_value=server_logger ), mock_engine(rng_seed=0): app = _build_app(server_config) - # client = TestClient(app) - # - # for _ in range(2): - # client.post("/predict", json={"sequences": "today is great"}) - # assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) + client = TestClient(app) + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + assert isinstance(fetch_leaf_logger(server_logger), PythonLogger) -# def test_data_logging_from_predefined(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name="text_classification", -# model=stub, -# add_predefined=[MetricFunctionConfig(func="text_classification")], -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# client.post( -# "/predict", -# json={ -# "sequences": [["Fun for adults and children.", "Fun for only children."]] -# }, -# ) -# calls = fetch_leaf_logger(server_logger).calls -# data_logging_logs = [call for call in calls if "DATA" in call] -# with open( -# "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 -# "r", -# ) as f: -# expected_logs = f.read().splitlines() -# for log, expected_log in zip(data_logging_logs, expected_logs): -# assert log == expected_log -# -# -# @flaky(max_runs=4, min_passes=3) -# def test_logging_only_system_info(): -# server_config = ServerConfig( -# endpoints=[EndpointConfig(task=task, name=name, model=stub)], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = 
server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# _test_logger_contents( -# fetch_leaf_logger(server_logger), -# {"prediction_latency": 8}, -# ) -# -# -# def test_regex_target_logging(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name=name, -# data_logging={ -# "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] -# }, -# model=stub, -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# _test_logger_contents( -# fetch_leaf_logger(server_logger), -# {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, -# ) -# -# -# def test_multiple_targets_logging(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name=name, -# data_logging={ -# "pipeline_inputs.sequences": [ -# MetricFunctionConfig(func="identity") -# ], -# "engine_inputs": [MetricFunctionConfig(func="identity")], -# }, -# model=stub, -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# _test_logger_contents( -# fetch_leaf_logger(server_logger), -# { -# "pipeline_inputs.sequences__identity": 2, -# "engine_inputs__identity": 2, -# "prediction_latency": 8, -# }, -# ) -# -# -# @flaky(max_runs=3, min_passes=2) -# def test_function_metric_with_target_loggers(): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, -# name=name, -# data_logging={ -# "pipeline_inputs.sequences[0]": [ -# MetricFunctionConfig( -# func="identity", target_loggers=["logger_1"] -# ) -# ], -# "engine_inputs": [MetricFunctionConfig(func="identity")], -# }, -# model=stub, -# ) -# ], -# loggers={ -# "logger_1": {"path": logger_identifier}, -# "logger_2": {"path": logger_identifier}, -# }, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# -# for _ in range(2): -# client.post("/predict", json={"sequences": "today is great"}) -# -# _test_logger_contents( -# server_logger.logger.loggers[1].logger.loggers[0], -# { -# "pipeline_inputs.sequences__identity": 2, -# "engine_inputs__identity": 2, -# "prediction_latency": 8, -# }, -# ) -# _test_logger_contents( -# server_logger.logger.loggers[1].logger.loggers[1], -# { -# "pipeline_inputs.sequences__identity": 0, -# "engine_inputs__identity": 2, -# "prediction_latency": 8, -# }, -# ) -# -# -# @mock_engine(rng_seed=0) -# def test_instantiate_prometheus(mock_engine, tmp_path): -# client = TestClient( -# 
_build_app( -# ServerConfig( -# endpoints=[EndpointConfig(task="text_classification", model="default")], -# loggers=dict( -# prometheus={ -# "port": find_free_port(), -# "text_log_save_dir": tmp_path.name, -# "text_log_save_frequency": 30, -# } -# ), -# ) -# ) -# ) -# r = client.post("/predict", json=dict(sequences="asdf")) -# assert r.status_code == 200 -# shutil.rmtree(tmp_path.name, ignore_errors=True) -# -# -# @mock_engine(rng_seed=0) -# def test_endpoint_system_logging(mock_engine): -# server_config = ServerConfig( -# system_logging=ServerSystemLoggingConfig( -# request_details=SystemLoggingGroup(enable=True), -# resource_utilization=SystemLoggingGroup(enable=True), -# ), -# endpoints=[ -# EndpointConfig( -# task="text_classification", -# model="default", -# route="/predict_text_classification", -# logging_config=PipelineSystemLoggingConfig( -# inference_details=SystemLoggingGroup(enable=True), -# prediction_latency=SystemLoggingGroup(enable=True), -# ), -# ), -# EndpointConfig( -# task="question_answering", -# model="default", -# route="/predict_question_answering", -# logging_config=PipelineSystemLoggingConfig( -# inference_details=SystemLoggingGroup(enable=True), -# prediction_latency=SystemLoggingGroup(enable=True), -# ), -# ), -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine: -# app = _build_app(server_config) -# client = TestClient(app) -# client.post("/predict_text_classification", json=dict(sequences="asdf")) -# client.post( -# "/predict_text_classification", json=dict(question="asdf", context="asdf") -# ) -# calls = server_logger.logger.loggers[0].logger.loggers[0].calls -# -# c = Counter([call.split(",")[0] for call in calls]) -# -# assert c == SAMPLE_LOGS_DICT + +def test_data_logging_from_predefined(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name="text_classification", + model=stub, + add_predefined=[MetricFunctionConfig(func="text_classification")], + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + client.post( + "/predict", + json={ + "sequences": [["Fun for adults and children.", "Fun for only children."]] + }, + ) + calls = fetch_leaf_logger(server_logger).calls + data_logging_logs = [call for call in calls if "DATA" in call] + with open( + "tests/deepsparse/loggers/metric_functions/predefined/predefined_logs/text_classification.txt", # noqa E501 + "r", + ) as f: + expected_logs = f.read().splitlines() + for log, expected_log in zip(data_logging_logs, expected_logs): + assert log == expected_log + + +@flaky(max_runs=4, min_passes=3) +def test_logging_only_system_info(): + server_config = ServerConfig( + endpoints=[EndpointConfig(task=task, name=name, model=stub)], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + 
_test_logger_contents( + fetch_leaf_logger(server_logger), + {"prediction_latency": 8}, + ) + + +def test_regex_target_logging(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name=name, + data_logging={ + "re:.*pipeline*.": [MetricFunctionConfig(func="identity")] + }, + model=stub, + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + _test_logger_contents( + fetch_leaf_logger(server_logger), + {"pipeline_inputs__identity": 2, "pipeline_outputs__identity": 2}, + ) + + +def test_multiple_targets_logging(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name=name, + data_logging={ + "pipeline_inputs.sequences": [ + MetricFunctionConfig(func="identity") + ], + "engine_inputs": [MetricFunctionConfig(func="identity")], + }, + model=stub, + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + _test_logger_contents( + fetch_leaf_logger(server_logger), + { + "pipeline_inputs.sequences__identity": 2, + "engine_inputs__identity": 2, + "prediction_latency": 8, + }, + ) + + +@flaky(max_runs=3, min_passes=2) +def test_function_metric_with_target_loggers(): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, + name=name, + data_logging={ + "pipeline_inputs.sequences[0]": [ + MetricFunctionConfig( + func="identity", target_loggers=["logger_1"] + ) + ], + "engine_inputs": [MetricFunctionConfig(func="identity")], + }, + model=stub, + ) + ], + loggers={ + "logger_1": {"path": logger_identifier}, + "logger_2": {"path": logger_identifier}, + }, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + + for _ in range(2): + client.post("/predict", json={"sequences": "today is great"}) + + _test_logger_contents( + server_logger.logger.loggers[1].logger.loggers[0], + { + "pipeline_inputs.sequences__identity": 2, + "engine_inputs__identity": 2, + "prediction_latency": 8, + }, + ) + _test_logger_contents( + server_logger.logger.loggers[1].logger.loggers[1], + { + "pipeline_inputs.sequences__identity": 0, + "engine_inputs__identity": 2, + "prediction_latency": 8, + }, + ) + + +@mock_engine(rng_seed=0) +def test_instantiate_prometheus(mock_engine, tmp_path): + client = TestClient( + _build_app( + ServerConfig( + endpoints=[EndpointConfig(task="text_classification", model="default")], + loggers=dict( + prometheus={ + "port": find_free_port(), + "text_log_save_dir": tmp_path.name, + "text_log_save_frequency": 30, + } + ), + ) + ) + ) + r = client.post("/predict", json=dict(sequences="asdf")) + assert r.status_code == 200 + shutil.rmtree(tmp_path.name, ignore_errors=True) + + +@mock_engine(rng_seed=0) +def 
test_endpoint_system_logging(mock_engine): + server_config = ServerConfig( + system_logging=ServerSystemLoggingConfig( + request_details=SystemLoggingGroup(enable=True), + resource_utilization=SystemLoggingGroup(enable=True), + ), + endpoints=[ + EndpointConfig( + task="text_classification", + model="default", + route="/predict_text_classification", + logging_config=PipelineSystemLoggingConfig( + inference_details=SystemLoggingGroup(enable=True), + prediction_latency=SystemLoggingGroup(enable=True), + ), + ), + EndpointConfig( + task="question_answering", + model="default", + route="/predict_question_answering", + logging_config=PipelineSystemLoggingConfig( + inference_details=SystemLoggingGroup(enable=True), + prediction_latency=SystemLoggingGroup(enable=True), + ), + ), + ], + loggers={"logger_1": {"path": logger_identifier}}, + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine: + app = _build_app(server_config) + client = TestClient(app) + client.post("/predict_text_classification", json=dict(sequences="asdf")) + client.post( + "/predict_text_classification", json=dict(question="asdf", context="asdf") + ) + calls = server_logger.logger.loggers[0].logger.loggers[0].calls + + c = Counter([call.split(",")[0] for call in calls]) + + assert c == SAMPLE_LOGS_DICT diff --git a/tests/server/test_system_logging.py b/tests/server/test_system_logging.py index bd0a8a3ae3..b6a3fbd2b6 100644 --- a/tests/server/test_system_logging.py +++ b/tests/server/test_system_logging.py @@ -1,169 +1,169 @@ -# # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, -# # software distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. 
-# -# from unittest import mock -# -# import pytest -# from deepsparse.loggers.config import SystemLoggingGroup -# from deepsparse.server.config import ( -# EndpointConfig, -# ServerConfig, -# ServerSystemLoggingConfig, -# ) -# from deepsparse.server.helpers import server_logger_from_config -# from deepsparse.server.server import _build_app -# from deepsparse.server.system_logging import log_resource_utilization -# from fastapi.testclient import TestClient -# from tests.deepsparse.loggers.helpers import ListLogger -# from tests.utils import mock_engine -# -# -# logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" -# stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 -# task = "text-classification" -# name = "endpoint_name" -# -# -# def _test_successful_requests(calls, successful_request): -# relevant_call = [ -# call -# for call in calls -# if call.startswith("identifier:request_details/successful_request_count") -# ] -# assert len(relevant_call) == 1 -# relevant_call = relevant_call[0] -# value = bool(int(relevant_call.split("value:")[1].split(",")[0])) -# assert value == successful_request -# -# -# def _test_response_msg(calls, response_msg): -# relevant_call = [ -# call -# for call in calls -# if call.startswith("identifier:request_details/response_message") -# ] -# assert len(relevant_call) == 1 -# relevant_call = relevant_call[0] -# value = relevant_call.split("value:")[1].split(",")[0] -# assert value == response_msg -# -# -# @pytest.mark.parametrize( -# "json_payload, input_batch_size, successful_request, response_msg", -# [ -# ({"sequences": "today is great"}, 1, True, "Response status code: 200"), -# ( -# {"sequences": ["today is great", "today is great"]}, -# 2, -# True, -# "Response status code: 200", -# ), -# ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), -# ], -# ) -# def test_log_request_details( -# json_payload, input_batch_size, successful_request, response_msg -# ): -# server_config = ServerConfig( -# endpoints=[ -# EndpointConfig( -# task=task, name=name, model=stub, batch_size=input_batch_size -# ) -# ], -# loggers={"logger_1": {"path": logger_identifier}}, -# system_logging=ServerSystemLoggingConfig( -# request_details=SystemLoggingGroup(enable=True) -# ), -# ) -# server_logger = server_logger_from_config(server_config) -# with mock.patch( -# "deepsparse.server.server.server_logger_from_config", return_value=server_logger -# ), mock_engine(rng_seed=0): -# app = _build_app(server_config) -# client = TestClient(app) -# client.post("/predict", json=json_payload) -# -# calls = server_logger.logger.loggers[0].logger.loggers[0].calls -# -# _test_successful_requests(calls, successful_request) -# _test_response_msg(calls, response_msg) -# -# -# def _test_cpu_utilization(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith("identifier:resource_utilization/cpu_utilization_percent") -# ] -# assert len(relevant_calls) == num_iterations -# -# -# def _test_memory_utilization(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith("identifier:resource_utilization/memory_utilization_percent") -# ] -# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] -# assert len(relevant_calls) == num_iterations -# # memory utilization is a percentage, so it should be between 0 and 100 -# assert all(0.0 < value < 100.0 for value in values) -# -# -# def 
_test_total_memory_available(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith( -# "identifier:resource_utilization/total_memory_available_bytes" -# ) -# ] -# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] -# assert len(relevant_calls) == num_iterations -# # assert all values are the same (total memory available is constant) -# assert all(value == values[0] for value in values) -# -# -# def _test_additional_items_to_log(calls, num_iterations): -# relevant_calls = [ -# call -# for call in calls -# if call.startswith("identifier:resource_utilization/test") -# ] -# values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] -# assert len(relevant_calls) == num_iterations -# # assert all values are the same ({"test" : 1} is constant) -# assert all(value == 1 for value in values) -# -# -# @pytest.mark.parametrize( -# "num_iterations, additional_items_to_log", -# [ -# (5, {}), -# (3, {"test": 1}), -# ], -# ) -# def test_log_resource_utilization(num_iterations, additional_items_to_log): -# server_logger = ListLogger() -# -# for iter in range(num_iterations): -# log_resource_utilization( -# server_logger, prefix="resource_utilization", **additional_items_to_log -# ) -# -# calls = server_logger.calls -# -# _test_cpu_utilization(calls, num_iterations) -# _test_memory_utilization(calls, num_iterations) -# _test_total_memory_available(calls, num_iterations) +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
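+"""
+Tests for server-side system logging: request details (response message and
+successful-request counts) and resource utilization (CPU, memory, and total
+available memory) emitted through the configured loggers.
+"""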
+ +from unittest import mock + +import pytest +from deepsparse.loggers.config import SystemLoggingGroup +from deepsparse.server.config import ( + EndpointConfig, + ServerConfig, + ServerSystemLoggingConfig, +) +from deepsparse.server.helpers import server_logger_from_config +from deepsparse.server.server import _build_app +from deepsparse.server.system_logging import log_resource_utilization +from fastapi.testclient import TestClient +from tests.deepsparse.loggers.helpers import ListLogger +from tests.utils import mock_engine + + +logger_identifier = "tests/deepsparse/loggers/helpers.py:ListLogger" +stub = "zoo:nlp/text_classification/distilbert-none/pytorch/huggingface/qqp/pruned80_quant-none-vnni" # noqa E501 +task = "text-classification" +name = "endpoint_name" + + +def _test_successful_requests(calls, successful_request): + relevant_call = [ + call + for call in calls + if call.startswith("identifier:request_details/successful_request_count") + ] + assert len(relevant_call) == 1 + relevant_call = relevant_call[0] + value = bool(int(relevant_call.split("value:")[1].split(",")[0])) + assert value == successful_request + + +def _test_response_msg(calls, response_msg): + relevant_call = [ + call + for call in calls + if call.startswith("identifier:request_details/response_message") + ] + assert len(relevant_call) == 1 + relevant_call = relevant_call[0] + value = relevant_call.split("value:")[1].split(",")[0] + assert value == response_msg + + +@pytest.mark.parametrize( + "json_payload, input_batch_size, successful_request, response_msg", + [ + ({"sequences": "today is great"}, 1, True, "Response status code: 200"), + ( + {"sequences": ["today is great", "today is great"]}, + 2, + True, + "Response status code: 200", + ), + ({"this": "is supposed to fail"}, 1, False, "Response status code: 422"), + ], +) +def test_log_request_details( + json_payload, input_batch_size, successful_request, response_msg +): + server_config = ServerConfig( + endpoints=[ + EndpointConfig( + task=task, name=name, model=stub, batch_size=input_batch_size + ) + ], + loggers={"logger_1": {"path": logger_identifier}}, + system_logging=ServerSystemLoggingConfig( + request_details=SystemLoggingGroup(enable=True) + ), + ) + server_logger = server_logger_from_config(server_config) + with mock.patch( + "deepsparse.server.server.server_logger_from_config", return_value=server_logger + ), mock_engine(rng_seed=0): + app = _build_app(server_config) + client = TestClient(app) + client.post("/predict", json=json_payload) + + calls = server_logger.logger.loggers[0].logger.loggers[0].calls + + _test_successful_requests(calls, successful_request) + _test_response_msg(calls, response_msg) + + +def _test_cpu_utilization(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith("identifier:resource_utilization/cpu_utilization_percent") + ] + assert len(relevant_calls) == num_iterations + + +def _test_memory_utilization(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith("identifier:resource_utilization/memory_utilization_percent") + ] + values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] + assert len(relevant_calls) == num_iterations + # memory utilization is a percentage, so it should be between 0 and 100 + assert all(0.0 < value < 100.0 for value in values) + + +def _test_total_memory_available(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith( + 
"identifier:resource_utilization/total_memory_available_bytes" + ) + ] + values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] + assert len(relevant_calls) == num_iterations + # assert all values are the same (total memory available is constant) + assert all(value == values[0] for value in values) + + +def _test_additional_items_to_log(calls, num_iterations): + relevant_calls = [ + call + for call in calls + if call.startswith("identifier:resource_utilization/test") + ] + values = [float(call.split("value:")[1].split(",")[0]) for call in relevant_calls] + assert len(relevant_calls) == num_iterations + # assert all values are the same ({"test" : 1} is constant) + assert all(value == 1 for value in values) + + +@pytest.mark.parametrize( + "num_iterations, additional_items_to_log", + [ + (5, {}), + (3, {"test": 1}), + ], +) +def test_log_resource_utilization(num_iterations, additional_items_to_log): + server_logger = ListLogger() + + for iter in range(num_iterations): + log_resource_utilization( + server_logger, prefix="resource_utilization", **additional_items_to_log + ) + + calls = server_logger.calls + + _test_cpu_utilization(calls, num_iterations) + _test_memory_utilization(calls, num_iterations) + _test_total_memory_available(calls, num_iterations) From 4ad5f49d4845baa19156c6ea499c854f52c5cc21 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 13 Jun 2023 05:34:42 +0000 Subject: [PATCH 44/68] fix the erronous Makefile --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 09082e8c38..50476f2e7c 100644 --- a/Makefile +++ b/Makefile @@ -53,12 +53,12 @@ artifacts: # run tests for the repo test: @echo "Running python tests"; - @SPARSEZOO_TEST_MODE="true" @NM_DISABLE_ANALYTICS="true" pytest tests/ --ignore integrations $(PYTEST_ARGS); + @SPARSEZOO_TEST_MODE="true" NM_DISABLE_ANALYTICS="true" pytest tests/ --ignore integrations $(PYTEST_ARGS); # run integrations tests for the repo test_integrations: @echo "Running package integrations tests"; - @SPARSEZOO_TEST_MODE="true" @NM_DISABLE_ANALYTICS="true" pytest integrations/ --ignore tests $(PYTEST_ARGS); + @SPARSEZOO_TEST_MODE="true" NM_DISABLE_ANALYTICS="true" pytest integrations/ --ignore tests $(PYTEST_ARGS); # create docs docs: From f97467f36b20a16f3cf79c41509e9c94bd4b3686 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 13 Jun 2023 06:23:03 +0000 Subject: [PATCH 45/68] perhaps fixed GHA --- src/deepsparse/utils/onnx.py | 3 ++- tests/utils/engine_mocking.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index eb31179bc9..5ce98ce6a0 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -198,6 +198,7 @@ def generate_random_inputs( return input_data_list +@contextlib.contextmanager def override_onnx_batch_size( onnx_filepath: str, batch_size: int, @@ -233,7 +234,7 @@ def override_onnx_batch_size( f"Overwriting in-place the batch size of the model at {onnx_filepath}" ) save_onnx(model, onnx_filepath) - return onnx_filepath + yield onnx_filepath else: return save_onnx_to_temp_files(model, with_external_data=not inplace) diff --git a/tests/utils/engine_mocking.py b/tests/utils/engine_mocking.py index 4a83c42c00..978971c63d 100644 --- a/tests/utils/engine_mocking.py +++ b/tests/utils/engine_mocking.py @@ -99,7 +99,9 @@ def __init__( # Assumes the first dimension is batch dimension!! 
# However in general we cannot assume that all outputs have # a batch dimension, that's why we need onnxruntime here. - with override_onnx_batch_size(model_path, batch_size) as batched_model_path: + with override_onnx_batch_size( + model_path, batch_size, inplace=True + ) as batched_model_path: session = ort.InferenceSession(batched_model_path) self.input_descriptors = list(map(_to_descriptor, session.get_inputs())) self.output_descriptors = list(map(_to_descriptor, session.get_outputs())) From 6be8d87f3496ef2d2898c998d7dfa346bc694dcb Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 13 Jun 2023 06:42:56 +0000 Subject: [PATCH 46/68] take into consideration that GHA creates four files --- tests/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 62f781f043..7be5656806 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -69,7 +69,12 @@ def check_for_created_files(): end_files_root = _get_files(directory=r".") end_files_temp = _get_files(directory=tempfile.gettempdir()) - assert len(start_files_root) >= len(end_files_root), ( + max_allowed_number_created_files = 4 + # GHA needs to create following files: + # pyproject.toml, CONTRIBUTING.md, LICENSE, setup.cfg + assert len(start_files_root) + max_allowed_number_created_files >= len( + end_files_root + ), ( f"{len(end_files_root) - len(start_files_root)} " f"files created in current working " f"directory during pytest run. " From e2f088dd10e6591b58314cd6b5ff603ad94ba700 Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 13 Jun 2023 11:22:55 +0000 Subject: [PATCH 47/68] initial commit --- src/deepsparse/benchmark/ort_engine.py | 34 +- src/deepsparse/engine.py | 87 ++- src/deepsparse/pipeline.py | 33 +- src/deepsparse/pipelines/text_generation.py | 542 ++++++++++++++++++ src/deepsparse/tasks.py | 3 +- src/deepsparse/timing/pipeline_timer.py | 193 ------- .../transformers/pipelines/text_generation.py | 40 +- src/deepsparse/utils/onnx.py | 73 ++- 8 files changed, 700 insertions(+), 305 deletions(-) create mode 100644 src/deepsparse/pipelines/text_generation.py delete mode 100644 src/deepsparse/timing/pipeline_timer.py diff --git a/src/deepsparse/benchmark/ort_engine.py b/src/deepsparse/benchmark/ort_engine.py index e9f16c1e0f..4a340d326b 100644 --- a/src/deepsparse/benchmark/ort_engine.py +++ b/src/deepsparse/benchmark/ort_engine.py @@ -62,8 +62,8 @@ def _validate_ort_import(): ) -def _validate_batch_size(batch_size: int) -> Optional[int]: - if batch_size is not None and batch_size < 1: +def _validate_batch_size(batch_size: int) -> int: + if batch_size < 1: raise ValueError("batch_size must be greater than 0") return batch_size @@ -116,15 +116,29 @@ def __init__( f" num_cores={num_cores}, please specify CPUExecutionProvider" ) + # TODO (michael): Unfortunately we are stacking overrides here, this can be + # cleaned up once we pass the loaded ONNX around and not paths if self._input_shapes: - override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) - if self._batch_size is not None: - override_onnx_batch_size(self._model_path, self._batch_size, inplace=True) - self._eng_net = onnxruntime.InferenceSession( - self._model_path, - sess_options, - providers=providers, - ) + with override_onnx_input_shapes( + self._model_path, self._input_shapes, inplace=True + ) as input_override_model_path: + with override_onnx_batch_size( + input_override_model_path, batch_size, inplace=True + ) as batch_override_model_path: + self._eng_net = 
onnxruntime.InferenceSession( + batch_override_model_path, + sess_options, + providers=providers, + ) + else: + with override_onnx_batch_size( + self._model_path, batch_size, inplace=True + ) as batch_override_model_path: + self._eng_net = onnxruntime.InferenceSession( + batch_override_model_path, + sess_options, + providers=providers, + ) def __call__( self, diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index b7641287b2..b8da033d25 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -276,23 +276,32 @@ def __init__( num_streams: int = None, scheduler: Scheduler = None, input_shapes: List[List[int]] = None, - cache_inputs: List[bool] = None, ): BaseEngine.construct( self, model, batch_size, num_cores, num_streams, scheduler, input_shapes ) - num_streams = _validate_num_streams(num_streams, self._num_cores) - if self._input_shapes is not None: - override_onnx_input_shapes(self._model_path, self._input_shapes, inplace=True) - self._eng_net = LIB.deepsparse_engine( - self._model_path, - self._batch_size, - self._num_cores, - num_streams, - self._scheduler.value, - None, - cache_inputs, - ) + + if self._input_shapes: + with override_onnx_input_shapes( + self._model_path, self._input_shapes + ) as model_path: + self._eng_net = LIB.deepsparse_engine( + model_path, + self._batch_size, + self._num_cores, + self._num_streams, + self._scheduler.value, + None, + ) + else: + self._eng_net = LIB.deepsparse_engine( + self._model_path, + self._batch_size, + self._num_cores, + self._num_streams, + self._scheduler.value, + None, + ) def __call__( self, @@ -727,7 +736,7 @@ def __init__( if self._input_shapes: with override_onnx_input_shapes( - self._model_path, self._input_shapes + self._model_path, self._input_shapes, inplace=True ) as model_path: self._eng_net = LIB.deepsparse_engine( model_path, @@ -815,7 +824,9 @@ def __init__( if self._input_shapes: with override_onnx_input_shapes( - self._model_path, self._input_shapes + self._model_path, + self._input_shapes, + inplace=True, ) as model_path: self._eng_net = LIB.deepsparse_engine( model_path, @@ -836,6 +847,52 @@ def __init__( ) +class KVCacheEngine(Engine): + """ + Engine that can do kv caching. 
+ """ + + def __init__( + self, + model: Union[str, "Model", "File"], + batch_size: int = 1, + num_cores: int = None, + num_streams: int = None, + scheduler: Scheduler = None, + input_shapes: List[List[int]] = None, + kv_cache_bools: List[bool] = None, + prev_cache_length: int = 0, + ): + BaseEngine.construct( + self, model, batch_size, num_cores, num_streams, scheduler, input_shapes + ) + + if kv_cache_bools is None: + # If no list was provided, then we assume all outputs except for the first are KV caches + # Note: In the future we can look at the names of outputs to be more sure + # + # Create a boolean list of every output of the model + output_names = get_output_names(self._model_path) + kv_cache_bools = [True for i in range(len(output_names))] + # Assume first input is logits and logits ought not to be cached + kv_cache_bools[0] = False + + num_streams = _validate_num_streams(num_streams, self._num_cores) + if self._input_shapes: + raise NotImplementedError("Don't do this yet :)") + else: + self._eng_net = LIB.deepsparse_engine( + self._model_path, + self._batch_size, + self._num_cores, + num_streams, + self._scheduler.value, + None, + kv_cache_bools, + prev_cache_length, + ) + + def compile_model( model: Union[str, "Model", "File"], batch_size: int = 1, diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 75abb42939..3fac78592d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -37,7 +37,6 @@ ) from deepsparse.tasks import SupportedTasks, dynamic_import_task from deepsparse.timing import InferencePhases, Timer -from deepsparse.timing.pipeline_timer import PipelineTimer __all__ = [ @@ -142,7 +141,6 @@ class PipelineImplementation(Pipeline): a path to the logging config, or yaml string representation the logging config. If logger provided (in any form), the pipeline will log inference metrics to the logger. Default is None - :param benchmark: An optional boolean flag that can be used to enable/disable """ def __init__( @@ -157,12 +155,8 @@ def __init__( context: Optional[Context] = None, executor: Optional[Union[ThreadPoolExecutor, int]] = None, logger: Optional[Union[BaseLogger, str]] = None, - benchmark: bool = False, _delay_engine_initialize: bool = False, # internal use only ): - self._benchmark = benchmark - self._timer = PipelineTimer(enabled=benchmark, multi_inference=True) - self._timer.reset() self._model_path_orig = model_path self._model_path = model_path self._engine_type = engine_type @@ -224,15 +218,12 @@ def __call__(self, *args, **kwargs) -> BaseModel: "invalid kwarg engine_inputs. 
engine inputs determined " f"by {self.__class__.__qualname__}.parse_inputs" ) - self._timer.reset() timer = Timer() timer.start(InferencePhases.TOTAL_INFERENCE) - self._timer.start_inference_stage(InferencePhases.TOTAL_INFERENCE) # ------ PREPROCESSING ------ timer.start(InferencePhases.PRE_PROCESS) - self._timer.start_inference_stage(InferencePhases.PRE_PROCESS) # parse inputs into input_schema pipeline_inputs = self.parse_inputs(*args, **kwargs) @@ -254,7 +245,6 @@ def __call__(self, *args, **kwargs) -> BaseModel: else: postprocess_kwargs = {} timer.stop(InferencePhases.PRE_PROCESS) - self._timer.stop_inference_stage(InferencePhases.PRE_PROCESS) self.log( identifier="engine_inputs", @@ -270,18 +260,14 @@ def __call__(self, *args, **kwargs) -> BaseModel: # ------ INFERENCE ------ # split inputs into batches of size `self._batch_size` timer.start(InferencePhases.ENGINE_FORWARD) - self._timer.start_inference_stage(InferencePhases.ENGINE_FORWARD) - # Hack to enable inference with `cache_length` argument - # batches = self.split_engine_inputs(engine_inputs, self._batch_size) - batches = [engine_inputs] + batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) timer.stop(InferencePhases.ENGINE_FORWARD) - self._timer.stop_inference_stage(InferencePhases.ENGINE_FORWARD) self.log( identifier=f"{SystemGroups.INFERENCE_DETAILS}/input_batch_size_total", @@ -306,7 +292,6 @@ def __call__(self, *args, **kwargs) -> BaseModel: # ------ POSTPROCESSING ------ timer.start(InferencePhases.POST_PROCESS) - self._timer.start_inference_stage(InferencePhases.POST_PROCESS) pipeline_outputs = self.process_engine_outputs( engine_outputs, **postprocess_kwargs ) @@ -316,9 +301,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: f"{self.output_schema} found output of type {type(pipeline_outputs)}" ) timer.stop(InferencePhases.POST_PROCESS) - self._timer.stop_inference_stage(InferencePhases.POST_PROCESS) timer.stop(InferencePhases.TOTAL_INFERENCE) - self._timer.stop_inference_stage(InferencePhases.TOTAL_INFERENCE) self.log( identifier="pipeline_outputs", @@ -749,18 +732,6 @@ def engine_type(self) -> str: """ return self._engine_type - @property - def timer(self) -> PipelineTimer: - return self._timer - - @property - def benchmark(self) -> bool: - return self._benchmark - - @benchmark.setter - def benchmark(self, value: bool): - self._benchmark = value - def to_config(self) -> "PipelineConfig": """ :return: PipelineConfig that can be used to reload this object diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py new file mode 100644 index 0000000000..a4a510b74f --- /dev/null +++ b/src/deepsparse/pipelines/text_generation.py @@ -0,0 +1,542 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Dict, List, Optional, Tuple, Type + +import numpy +from onnx import ValueInfoProto +from pydantic import BaseModel, Field +from transformers import AutoConfig, AutoTokenizer + +from deepsparse import Pipeline +from deepsparse.transformers.helpers import ( + get_onnx_path_and_configs, + overwrite_transformer_onnx_model_inputs, +) +from deepsparse.transformers.pipelines import TransformersPipeline +from deepsparse.utils import get_output_names + + +OPT_CACHE_HIDDEN_DIM = 64 + + +__all__ = ["TextGenerationPipeline"] + + +def softmax(x: numpy.ndarray) -> numpy.ndarray: + """ + Compute softmax values for x + :param x: input array + :return: softmax values + """ + return numpy.exp(x) / numpy.sum(numpy.exp(x), axis=0) + + +class TextGenerationInput(BaseModel): + sequence: str = Field( + description="The input sequence to generate the text from.", + ) + + +class TextGenerationOutput(BaseModel): + sequence: str = Field( + description="The generated text sequence.", + ) + + +@Pipeline.register( + task="text_generation", + task_aliases=["codegen", "opt"], +) +class TextGenerationPipeline(TransformersPipeline): + """ + Pipeline for text generation tasks. + + :param deterministic: if True, the pipeline will sample from + the probability distribution computed from the logits. + If False, the pipeline will get the next token by applying + an argmax function to the logits. + :param sampling_temperature: the temperature to use when sampling + from the probability distribution computed from the logits. + Higher values will result in more random samples. + :param max_generated_tokens: the maximum number of tokens to generate + given the input sequence. If None, the model will generate + tokens until the end of the sequence is reached. + Otherwise, it will generate up to the maximum number of tokens or end of + sequence is reached. + :param prompt_batch_threshold: the threshold for the ratio of running the prompt + as a single inference vs running the prompt auto-regressively. + If the number of input sequences divided by the max sequence length is + greater than the threshold, the prompt will be run as a single inference. + Default is None, which will always run auto-regressively. + :param force_max_tokens: if True, the pipeline will generate the maximum number + of tokens supplied even if the stop token is reached. 
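    A rough usage sketch for this pipeline (the model directory, sequence length,
    threshold, and token budget below are placeholder values, not taken from this
    patch; the directory is assumed to hold the exported ONNX model plus
    config.json and tokenizer files):

        from deepsparse import Pipeline

        # "./opt-text-gen-model" is a hypothetical local export directory
        pipeline = Pipeline.create(
            task="text_generation",
            model_path="./opt-text-gen-model",
            sequence_length=128,
            prompt_batch_threshold=0.5,
            max_generated_tokens=64,
        )
        generated = pipeline(sequence="def fibonacci(n):")
        print(generated.sequence)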
+ :param kwargs: kwargs to pass to the TransformersPipeline + """ + + def __init__( + self, + deterministic: bool = True, + sampling_temperature: float = 1.0, + max_generated_tokens: Optional[int] = 1024, + prompt_batch_threshold: float = None, + force_max_tokens: bool = False, + **kwargs, + ): + super().__init__(**kwargs, _delay_engine_initialize=True) + + if self._batch_size != 1: + raise ValueError("Only batch size 1 is supported for generation pipelines") + + self.deterministic = deterministic + self.sampling_temperature = sampling_temperature + self.max_generated_tokens = max_generated_tokens + self.prompt_batch_threshold = prompt_batch_threshold + self.force_max_tokens = force_max_tokens + + self.engine = Pipeline.create_engine( + self.onnx_file_path, + self.engine_type, + self.engine_args, + self.context, + ) + + if prompt_batch_threshold is not None and prompt_batch_threshold < 1: + ( + self.onnx_multitoken_path, + self._temp_model_directory, + ) = self._setup_onnx_multitoken_file_path() + self.multitoken_engine = Pipeline.create_engine( + self.onnx_multitoken_path, + self.engine_type, + self.engine_args, + self.context, + ) + else: + self.onnx_multitoken_path = None + self.multitoken_engine = None + + # override tokenizer to pad to left + self.tokenizer.padding_side = "left" + + @staticmethod + def route_input_to_bucket( + *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs + ) -> Pipeline: + """ + This method is used to route the input to the correct pipeline. + + :param args: args to pass to the pipeline + :param input_schema: the input schema for the pipeline + :param pipelines: the list of pipelines to route the input to + :param kwargs: kwargs to pass to the pipeline + :return: the pipeline to route the input to + """ + raise ValueError("Bucketing is not supported for generation pipelines") + + @property + def input_schema(self) -> Type[BaseModel]: + """ + Property to return the input schema for the pipeline. + + :return: the input schema for the pipeline + """ + return TextGenerationInput + + @property + def output_schema(self) -> Type[BaseModel]: + """ + Property to return the output schema for the pipeline. + + :return: the output schema for the pipeline + """ + return TextGenerationOutput + + def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: + """ + Convert the input schema for the pipeline to the inputs for the engine. + + :param inputs: the input schema for the pipeline + :return: the inputs for the engine + """ + + self.tokenizer.pad_token = self.tokenizer.eos_token + + input_tokens = self.tokenizer( + inputs.sequence, + return_tensors="np", + max_length=self.sequence_length, + padding="max_length", + ) + + kv_cache = self._initialize_kv_cache(length=0) + + attention_mask = input_tokens["attention_mask"] + + positions = attention_mask.cumsum(1) * attention_mask + positions -= 1 # zero index - TODO: investigate if needed outside OPT + positions_input = dict(positions=positions) + + input_tokens = {**input_tokens, **kv_cache, **positions_input} + engine_input = self.tokens_to_engine_input(input_tokens) + + return engine_input + + def process_engine_outputs( + self, engine_outputs: List[numpy.ndarray], **kwargs + ) -> TextGenerationOutput: + """ + Convert the engine outputs to the output schema for the pipeline. 
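        A toy illustration of the positions tensor built in process_inputs from a
        left-padded attention mask (the mask below is made up):

            import numpy

            attention_mask = numpy.array([[0, 0, 0, 1, 1, 1]])  # 3 real tokens, padded to length 6
            positions = attention_mask.cumsum(1) * attention_mask - 1
            # positions -> [[-1, -1, -1, 0, 1, 2]]: real tokens get zero-indexed positions, pads get -1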
+ + :param engine_outputs: the outputs from the engine + :return: the output schema for the pipeline + """ + sequence = self.tokenizer.decode(engine_outputs[0][0], skip_special_tokens=True) + return TextGenerationOutput(sequence=sequence) + + def engine_forward( + self, engine_inputs: List[numpy.ndarray], **kwargs + ) -> numpy.ndarray: + """ + Run the forward pass on the engine. + + :param engine_inputs: list of numpy inputs to + Pipeline engine forward pass + :return: A numpy array that contains the tokens generated by the model + """ + # run the prompt through + tokens, kv_cache = self.prompt_inference(engine_inputs) + num_prompt_tokens = len(tokens) - 1 + + # create the generated output + max_tokens = ( + self.max_generated_tokens + if self.max_generated_tokens and self.max_generated_tokens > 0 + else 100 * self.sequence_length + ) # set safety for absolute max generation + generated = [tokens[-1]] + + while len(generated) < max_tokens: + gen_token, kv_cache = self.autoregressive_inference( + tokens, kv_cache, num_prompt_tokens + ) + tokens.append(gen_token) + generated.append(gen_token) + + if gen_token == self.tokenizer.eos_token_id and not self.force_max_tokens: + break + + return numpy.array([[generated]]) + + def prompt_inference( + self, engine_inputs: List[numpy.ndarray] + ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + """ + An inference run that processes the prompt through the + model to generate the new token and populate the kv cache. + + :param engine_inputs: the prompt (context) represented by a + list of numpy inputs to the engine + :return: + - the list of prompt tokens plus the new, generated token + - the kv cache that was populated during the inference + """ + self.timer.start_inference_stage("prompt_inference") + # get tokens by attention mask + tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() + + new_token = None + + if ( + self.prompt_batch_threshold is None + or self.prompt_batch_threshold >= 1 + or len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold + ): + # prompt size is small, run autoregressive inference to populate kv cache + run_tokens = [] + kv_cache = {} + for token in tokens: + run_tokens.append(token) + new_token, kv_cache = self.autoregressive_inference( + run_tokens, kv_cache, num_prompt_tokens=0 + ) + else: + # larger prompt size, run through multi-token engine in single pass + self.timer.start_inference_stage("multitoken_engine") + logits, *cache_values = self.multitoken_engine(engine_inputs, val_inp=False) + self.timer.stop_inference_stage("multitoken_engine") + kv_cache = self.assemble_kv_cache(cache_values, tokens) + new_token = self.generate_token(logits[0, -1]) + + tokens.append(new_token) + self.timer.stop_inference_stage("prompt_inference") + + return tokens, kv_cache + + def autoregressive_inference( + self, + tokens: List[int], + kv_cache: Dict[str, numpy.ndarray], + num_prompt_tokens: int, + ) -> Tuple[int, Dict[str, numpy.ndarray]]: + """ + An inference run that processes the last token and the kv cache to + generate a new token and update the kv cache. 
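        A worked example of the routing rule used in prompt_inference, with
        made-up numbers:

            # hypothetical values for illustration only
            sequence_length = 128
            prompt_batch_threshold = 0.25
            num_prompt_tokens = 40

            use_multitoken_engine = (
                prompt_batch_threshold is not None
                and prompt_batch_threshold < 1
                and num_prompt_tokens / float(sequence_length) >= prompt_batch_threshold
            )
            # 40 / 128 = 0.3125 >= 0.25, so the prompt would run through the
            # multi-token engine in a single pass; otherwise it is processed
            # token by token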
+ + :param tokens: The current context (prompt + generated tokens so far) + :param kv_cache: The key-value cache from the previous inference run + :param num_prompt_tokens: number of tokens in the initial prompt + :return: + - the new, generated token + - the kv cache that was populated during the inference + """ + self.timer.start_inference_stage("autoregressive_inference") + new_token = tokens[-1] + + # padding is added to left, so attention mask is 1s from the + # right up to the number of total tokens (prompt + generated) + attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) + num_tokens_processed = min(len(tokens), self.sequence_length) # cap by seq len + attention_mask[:, -num_tokens_processed:] = 1 + # the position of the token is the number of tokens - 1 (zero indexed) + positions = numpy.array([[len(tokens)]], dtype=numpy.int64) + if num_prompt_tokens == 0: + # no prompt tokens, we are currently processing the prompt + positions -= 1 + + engine_inputs = { + "input_ids": numpy.array([[new_token]]), + "attention_mask": attention_mask, + "positions": positions, + } + # initialize the kv cache if it is empty + # (when the prompt is processed with the single-token engine) + kv_cache = ( + self._initialize_kv_cache(length=self.sequence_length - 1) + if kv_cache == {} + else kv_cache + ) + + engine_inputs.update(kv_cache) + engine_inputs = [engine_inputs[name] for name in self.engine.input_names] + + self.timer.start_inference_stage("autoregressive_inference_engine") + new_logits, *cache_values = self.engine(engine_inputs, val_inp=False) + self.timer.stop_inference_stage("autoregressive_inference_engine") + kv_cache = self.assemble_kv_cache(cache_values, tokens) + + # Obtain the next token from the logits + generated_token = self.generate_token(new_logits[0, 0, :]) + self.timer.stop_inference_stage("autoregressive_inference") + + return generated_token, kv_cache + + def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: + """ + Samples a token from the logits using the sampling temperature. + + :param logits: the logits from the model with shape (vocab_size,) + + :return: the sampled token + """ + if self.deterministic: + return numpy.argmax(logits) + + logits /= self.sampling_temperature + + probs = softmax(logits) + + return numpy.random.choice(len(probs), p=probs) + + def setup_onnx_file_path(self) -> str: + """ + Parses ONNX, tokenizer, and config file paths from the given `model_path`. 
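        The token selection in generate_token can be restated as a standalone
        sketch (an illustrative re-statement, not part of the pipeline's API):

            import numpy

            def sample_next_token(logits, deterministic=True, temperature=1.0):
                # argmax when deterministic, otherwise temperature-scaled softmax sampling
                if deterministic:
                    return int(numpy.argmax(logits))
                scaled = logits / temperature
                probs = numpy.exp(scaled) / numpy.sum(numpy.exp(scaled))
                return int(numpy.random.choice(len(probs), p=probs))

            print(sample_next_token(numpy.array([0.1, 2.0, 0.3])))  # -> 1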
+ Supports sparsezoo stubs + + :return: file path to the processed ONNX file for the engine to compile + """ + onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( + self.model_path, + require_configs=True, + ) + + self.config = AutoConfig.from_pretrained(config_path) + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + model_max_length=self.sequence_length, + ) + self.config_path = os.path.join(config_path, "config.json") + + ( + onnx_path, + self.onnx_input_names, + self._temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + onnx_path, + max_length=self.sequence_length, + load_external_data=False, + custom_input_overwrite_func=self.overwrite_onnx_model_inputs, + custom_input_overwrite_func_kwargs=dict( + multitoken=False, + num_attention_heads=self.config.num_attention_heads, + hidden_dims=OPT_CACHE_HIDDEN_DIM, + ), + ) + + return onnx_path + + def _initialize_kv_cache(self, length: int) -> Dict[str, numpy.ndarray]: + # initialize empty kv cache of size + # (num_attention_heads, length, hidden_dims) + empty_kv_cache_tensor = numpy.zeros( + ( + self.config.num_attention_heads, + length, + OPT_CACHE_HIDDEN_DIM, + ), + dtype=numpy.float32, + ) + + cache_keys = [ + output_name.replace("present", "past_key_values") + for output_name in self.engine.output_names + if output_name.startswith("present") + ] + return {key: empty_kv_cache_tensor for key in cache_keys} + + def assemble_kv_cache( + self, + cache_values: List[numpy.ndarray], + tokens: List[int], + consider_sos_token: bool = False, + ) -> Dict[str, numpy.ndarray]: + """ + Restructure the kv cache values from the engine output, so + that it can be passed to the engine in the next inference run. + + KV Cache concatenation adds an extra length dimension to the output + cache, that should be deleted after every inference run. + + There are two modes: + 1. Some values in the cache represent dummy (pad) tokens, padding is + to the left, so the left-most cache value is deleted + 2. The cache is saturated with non-dummy (meaningful) tokens: + - if there is a mandatory start-of-sequence (SOS) token, + we delete the left-most cache value that is not a cache + corresponding to SOS token. + - otherwise we delete from the left as in (1) + + :param cache_values: the cache values from the engine output + :param tokens: the tokens from the previous inference run + :param consider_sos_token: whether to consider the SOS token in the cache + :return kv_cache: the restructured cache values + """ + for idx, cache_value in enumerate(cache_values): + if len(tokens) > self.sequence_length - 1: + idx_to_remove = int(not consider_sos_token) + else: + idx_to_remove = 0 + + # TODO: see if we can do in-place + cache_values[idx] = numpy.delete(cache_value, idx_to_remove, 1) + + cache_keys = [ + name.replace("present", "past_key_values") + for name in self.engine.output_names + if name.startswith("present") + ] + kv_cache = dict(zip(cache_keys, cache_values)) + + return kv_cache + + @staticmethod + def overwrite_onnx_model_inputs( + external_inputs: List[ValueInfoProto], + batch_size: int, + sequence_length: int, + num_attention_heads: int, + hidden_dims: int, + multitoken: bool = True, + ) -> List[str]: + """ + Overwrite the input shape of the onnx model. 
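        A toy illustration of the trimming done in assemble_kv_cache for this
        (batch-free) cache layout of shape (num_attention_heads, length,
        hidden_dims); the head count below is made up, the hidden size matches
        OPT_CACHE_HIDDEN_DIM:

            import numpy

            present = numpy.zeros((12, 128, 64))  # cache returned by the engine, grown by one position
            past = numpy.delete(present, 0, 1)    # drop the left-most (padded or oldest) position
            print(past.shape)                     # (12, 127, 64), ready to feed back as past_key_values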
This function + is particular for the model with inputs: + - input_ids + - attention_mask + - positions + - past_key_values (x N) + + :param external_inputs: The external inputs of the onnx model + :param batch_size: The batch size of the input + :param sequence_length: The sequence length of the input + :param num_attention_heads: The number of attention heads + of the model (required to set the shape of the kv_cache) + :param hidden_dims: The hidden dimensions of the model + (required to set the shape of the kv_cache) + :param multitoken: A boolean flag that indicates whether + we are overwriting inputs to the model for multi-token + inference (sequence_len > 1) or single token inference + (sequence_len = 1). + :return: the input names of the onnx model + """ + input_names = [] + for external_input in external_inputs: + if external_input.name in ["input_ids", "positions"]: + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = ( + sequence_length if multitoken else 1 + ) + elif external_input.name == "attention_mask": + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + # regardless of multi-token or not, + # we always provide full attention mask + external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length + elif external_input.name.startswith("past_key_values"): + external_input.type.tensor_type.shape.dim[ + 0 + ].dim_value = num_attention_heads + # empty cache for multi-token runs, + # otherwise max cache len is max len - 1 + external_input.type.tensor_type.shape.dim[1].dim_value = ( + 0 if multitoken else sequence_length - 1 + ) + external_input.type.tensor_type.shape.dim[2].dim_value = hidden_dims + else: + raise ValueError( + f"Unexpected external input name: {external_input.name}" + ) + + input_names.append(external_input.name) + return input_names + + def _setup_onnx_multitoken_file_path(self) -> str: + ( + onnx_path, + _, + _temp_model_directory, + ) = overwrite_transformer_onnx_model_inputs( + self.onnx_file_path, + max_length=self.sequence_length, + load_external_data=False, + custom_input_overwrite_func=self.overwrite_onnx_model_inputs, + custom_input_overwrite_func_kwargs=dict( + multitoken=True, + num_attention_heads=self.config.num_attention_heads, + hidden_dims=OPT_CACHE_HIDDEN_DIM, + ), + ) + + return onnx_path, _temp_model_directory diff --git a/src/deepsparse/tasks.py b/src/deepsparse/tasks.py index 2aee2ddd9d..11db404e89 100644 --- a/src/deepsparse/tasks.py +++ b/src/deepsparse/tasks.py @@ -95,9 +95,10 @@ class SupportedTasks: ), ) - text_generation = namedtuple("text_generation", ["opt", "codegen"])( + text_generation = namedtuple("text_generation", ["opt", "codegen", "bloom"])( codegen=AliasedTask("codegen", []), opt=AliasedTask("opt", []), + bloom=AliasedTask("bloom", []), ) image_classification = namedtuple("image_classification", ["image_classification"])( diff --git a/src/deepsparse/timing/pipeline_timer.py b/src/deepsparse/timing/pipeline_timer.py deleted file mode 100644 index 13c4f5850b..0000000000 --- a/src/deepsparse/timing/pipeline_timer.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import time -from typing import Dict, List - - -__all__ = ["InferenceTimer", "PipelineTimer"] - - -class InferenceTimer: - def __init__(self): - self._staged_start_times = {} - self._staged_stop_times = {} - - def __repr__(self): - return f"InferenceTimer({self.times})" - - @property - def stages(self) -> List[str]: - return list(self._staged_start_times.keys()) - - @property - def times(self) -> Dict[str, float]: - return {stage: self.stage_average_time(stage) for stage in self.stages} - - @property - def all_times(self) -> Dict[str, List[float]]: - return {stage: self.stage_times(stage) for stage in self.stages} - - def clear(self): - self._staged_start_times.clear() - self._staged_stop_times.clear() - - def has_stage(self, stage: str) -> bool: - return stage in self.stages - - def start(self, stage: str): - if stage not in self._staged_start_times: - self._staged_start_times[stage] = [] - self._staged_stop_times[stage] = [] - - if len(self._staged_start_times[stage]) != len(self._staged_stop_times[stage]): - raise ValueError( - f"Attempting to start {stage} before a previous has been stopped:" - f" start times len({self._staged_start_times[stage]});" - f" stop times len({self._staged_stop_times[stage]})" - ) - - self._staged_start_times[stage].append(time.perf_counter()) - - def stop(self, stage: str): - if stage not in self._staged_start_times: - raise ValueError( - "Attempting to stop a stage that has not been started: " f"{stage}" - ) - - if ( - len(self._staged_start_times[stage]) - != len(self._staged_stop_times[stage]) + 1 - ): - raise ValueError( - f"Attempting to stop {stage} before a previous has been started:" - f" start times len({self._staged_start_times[stage]});" - f" stop times len({self._staged_stop_times[stage]})" - ) - - self._staged_stop_times[stage].append(time.perf_counter()) - - def stage_times(self, stage: str) -> List[float]: - if stage not in self._staged_start_times: - raise ValueError( - "Attempting to get time deltas for a stage that has not been started: " - f"{stage}" - ) - - if len(self._staged_start_times[stage]) != len(self._staged_stop_times[stage]): - raise ValueError( - "Attempting to get time deltas for a stage that has not been stopped: " - f"{stage}" - ) - - return [ - self._staged_stop_times[stage][i] - self._staged_start_times[stage][i] - for i in range(len(self._staged_start_times[stage])) - ] - - def stage_average_time(self, stage: str) -> float: - times = self.stage_times(stage) - - return sum(times) / len(times) - - -class PipelineTimer: - def __init__(self, enabled: bool = True, multi_inference: bool = False): - self._multi_inference = multi_inference - self._enabled = enabled - self._timers = [] - - def __repr__(self): - return f"PipelineTimer({self.times})" - - @property - def enabled(self): - return self._enabled - - @enabled.setter - def enabled(self, value): - self._enabled = value - - @property - def multi_inference(self): - return self._multi_inference - - @multi_inference.setter - def multi_inference(self, value): - self._multi_inference = value - - @property - def current_inference(self) -> InferenceTimer: - return 
self._timers[-1] if self._timers else None - - @property - def inferences(self) -> List[InferenceTimer]: - return self._timers - - @property - def stages(self) -> List[str]: - stages = set() - - for timer in self._timers: - stages.update(timer.stages) - - return list(stages) - - @property - def times(self) -> Dict[str, float]: - all_times = self.all_times - - return { - stage: sum(all_times[stage]) / len(all_times[stage]) - for stage in self.stages - } - - @property - def all_times(self) -> Dict[str, List[float]]: - all_times = {stage: [] for stage in self.stages} - - for timer in self._timers: - for stage, times in timer.all_times.items(): - all_times[stage].extend(times) - - return all_times - - def reset(self): - if not self._enabled: - return - - self._check_start_inference() - - if self.multi_inference: - self._timers.append(InferenceTimer()) - else: - self._timers[0].clear() - - def start_inference_stage(self, stage: str): - if not self._enabled: - return - - self._check_start_inference() - self._timers[-1].start(stage) - - def stop_inference_stage(self, stage: str): - if not self._enabled: - return - - self._check_start_inference() - self._timers[-1].stop(stage) - - def _check_start_inference(self): - if not self.current_inference: - self._timers.append(InferenceTimer()) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 9effbc3315..4dc8be0e6d 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -29,9 +29,6 @@ from deepsparse.utils import get_output_names -OPT_CACHE_HIDDEN_DIM = 64 - - __all__ = ["TextGenerationPipeline"] @@ -254,7 +251,6 @@ def prompt_inference( - the list of prompt tokens plus the new, generated token - the kv cache that was populated during the inference """ - self.timer.start_inference_stage("prompt_inference") # get tokens by attention mask tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() @@ -275,14 +271,11 @@ def prompt_inference( ) else: # larger prompt size, run through multi-token engine in single pass - self.timer.start_inference_stage("multitoken_engine") logits, *cache_values = self.multitoken_engine(engine_inputs, val_inp=False) - self.timer.stop_inference_stage("multitoken_engine") kv_cache = self.assemble_kv_cache(cache_values, tokens) new_token = self.generate_token(logits[0, -1]) tokens.append(new_token) - self.timer.stop_inference_stage("prompt_inference") return tokens, kv_cache @@ -303,7 +296,6 @@ def autoregressive_inference( - the new, generated token - the kv cache that was populated during the inference """ - self.timer.start_inference_stage("autoregressive_inference") new_token = tokens[-1] # padding is added to left, so attention mask is 1s from the @@ -329,18 +321,14 @@ def autoregressive_inference( if kv_cache == {} else kv_cache ) - engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] - self.timer.start_inference_stage("autoregressive_inference_engine") new_logits, *cache_values = self.engine(engine_inputs, val_inp=False) - self.timer.stop_inference_stage("autoregressive_inference_engine") kv_cache = self.assemble_kv_cache(cache_values, tokens) # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) - self.timer.stop_inference_stage("autoregressive_inference") return generated_token, kv_cache @@ -379,7 +367,6 @@ def setup_onnx_file_path(self) -> str: self.model_path, 
model_max_length=self.sequence_length, ) - self.config_path = os.path.join(config_path, "config.json") ( onnx_path, @@ -392,8 +379,6 @@ def setup_onnx_file_path(self) -> str: custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( multitoken=False, - num_attention_heads=self.config.num_attention_heads, - hidden_dims=OPT_CACHE_HIDDEN_DIM, ), ) @@ -401,7 +386,8 @@ def setup_onnx_file_path(self) -> str: def _initialize_kv_cache(self, length: int) -> Dict[str, numpy.ndarray]: # initialize empty kv cache of size - # (num_attention_heads, length, hidden_dims) + # (batch_size, num_attention_heads, length, hidden_dims) + raise NotImplementedError() empty_kv_cache_tensor = numpy.zeros( ( self.config.num_attention_heads, @@ -451,7 +437,6 @@ def assemble_kv_cache( else: idx_to_remove = 0 - # TODO: see if we can do in-place cache_values[idx] = numpy.delete(cache_value, idx_to_remove, 1) cache_keys = [ @@ -468,8 +453,6 @@ def overwrite_onnx_model_inputs( external_inputs: List[ValueInfoProto], batch_size: int, sequence_length: int, - num_attention_heads: int, - hidden_dims: int, multitoken: bool = True, ) -> List[str]: """ @@ -483,10 +466,6 @@ def overwrite_onnx_model_inputs( :param external_inputs: The external inputs of the onnx model :param batch_size: The batch size of the input :param sequence_length: The sequence length of the input - :param num_attention_heads: The number of attention heads - of the model (required to set the shape of the kv_cache) - :param hidden_dims: The hidden dimensions of the model - (required to set the shape of the kv_cache) :param multitoken: A boolean flag that indicates whether we are overwriting inputs to the model for multi-token inference (sequence_len > 1) or single token inference @@ -495,31 +474,24 @@ def overwrite_onnx_model_inputs( """ input_names = [] for external_input in external_inputs: + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size if external_input.name in ["input_ids", "positions"]: - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size external_input.type.tensor_type.shape.dim[1].dim_value = ( sequence_length if multitoken else 1 ) elif external_input.name == "attention_mask": - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # regardless of multi-token or not, - # we always provide full attention mask + # regardless of multi-token or not, always provide full attention mask external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length elif external_input.name.startswith("past_key_values"): - external_input.type.tensor_type.shape.dim[ - 0 - ].dim_value = num_attention_heads # empty cache for multi-token runs, # otherwise max cache len is max len - 1 - external_input.type.tensor_type.shape.dim[1].dim_value = ( + external_input.type.tensor_type.shape.dim[2].dim_value = ( 0 if multitoken else sequence_length - 1 ) - external_input.type.tensor_type.shape.dim[2].dim_value = hidden_dims else: raise ValueError( f"Unexpected external input name: {external_input.name}" ) - input_names.append(external_input.name) return input_names @@ -535,8 +507,6 @@ def _setup_onnx_multitoken_file_path(self) -> str: custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( multitoken=True, - num_attention_heads=self.config.num_attention_heads, - hidden_dims=OPT_CACHE_HIDDEN_DIM, ), ) diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index f68ff259d0..5ce98ce6a0 100644 --- 
a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -53,18 +53,22 @@ @contextlib.contextmanager -def save_onnx_to_temp_files(model: Model, with_external_data=True) -> str: +def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> str: """ Save model to a temporary file. Works for models with external data. + :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) + _LOGGER.info(f"Saving model to temporary directory: {tempfile.tempdir}") + if with_external_data: external_data = os.path.join( tempfile.tempdir, next(tempfile._get_candidate_names()) ) has_external_data = save_onnx(model, shaped_model.name, external_data) + _LOGGER.info(f"Saving external data to temporary directory: {external_data}") else: has_external_data = save_onnx(model, shaped_model.name) try: @@ -129,7 +133,7 @@ def get_external_inputs(onnx_filepath: str) -> List: :param onnx_filepath: File path to ONNX model :return: List of input objects """ - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -144,7 +148,7 @@ def get_external_outputs(onnx_filepath: str) -> List: :param onnx_filepath: File path to ONNX model :return: List of output objects """ - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath) return [output for output in model.graph.output] @@ -194,17 +198,29 @@ def generate_random_inputs( return input_data_list +@contextlib.contextmanager def override_onnx_batch_size( - onnx_filepath: str, batch_size: int, inplace: bool = False + onnx_filepath: str, + batch_size: int, + inplace: bool = True, ) -> str: """ Rewrite batch sizes of ONNX model, saving the modified model and returning its path - :param onnx_filepath: File path to ONNX model + + :param onnx_filepath: File path to ONNX model. If the graph is to be + modified in-place, only the model graph will be loaded and modified. + Otherwise, the entire model will be loaded and modified, so that + external data are saved along the model graph. :param batch_size: Override for the batch size dimension - :param inplace: If True, overwrite the original model file - :return: File path to modified ONNX model + :param inplace: If True, overwrite the original model file. + Else, save the modified model to a temporary file. + :return: File path to modified ONNX model. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. 
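    A minimal usage sketch of the context-manager form (the model path is a
    placeholder; with inplace=True the file at that path is rewritten on disk
    and the yielded path equals the input path):

        import onnxruntime as ort
        from deepsparse.utils.onnx import override_onnx_batch_size

        with override_onnx_batch_size("model.onnx", batch_size=4, inplace=True) as model_path:
            session = ort.InferenceSession(model_path)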
""" - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath, load_external_data=not inplace) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -214,30 +230,40 @@ def override_onnx_batch_size( external_input.type.tensor_type.shape.dim[0].dim_value = batch_size if inplace: - onnx.save(model, onnx_filepath) - return onnx_filepath - - # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + _LOGGER.info( + f"Overwriting in-place the batch size of the model at {onnx_filepath}" + ) + save_onnx(model, onnx_filepath) + yield onnx_filepath + else: + return save_onnx_to_temp_files(model, with_external_data=not inplace) def override_onnx_input_shapes( onnx_filepath: str, input_shapes: Union[List[int], List[List[int]]], - inplace: bool = False, + inplace: bool = True, ) -> str: """ Rewrite input shapes of ONNX model, saving the modified model and returning its path - :param onnx_filepath: File path to ONNX model + + :param onnx_filepath: File path to ONNX model. If the graph is to be + modified in-place, only the model graph will be loaded and modified. + Otherwise, the entire model will be loaded and modified, so that + external data are saved along the model graph. :param input_shapes: Override for model's input shapes :param inplace: If True, overwrite the original model file - :return: File path to modified ONNX model + :return: File path to modified ONNX model. + If inplace is True, + the modified model will be saved to the same path as the original + model. Else the modified model will be saved to a + temporary file. """ if input_shapes is None: return onnx_filepath - model = onnx.load(onnx_filepath, load_external_data=False) + model = onnx.load(onnx_filepath, load_external_data=not inplace) all_inputs = model.graph.input initializer_input_names = [node.name for node in model.graph.initializer] external_inputs = [ @@ -273,11 +299,17 @@ def override_onnx_input_shapes( dim.dim_value = input_shapes[input_idx][dim_idx] if inplace: + _LOGGER.info( + "Overwriting in-place the input shapes of the model " f"at {onnx_filepath}" + ) onnx.save(model, onnx_filepath) return onnx_filepath - - # Save modified model, this will be cleaned up when context is exited - return save_onnx_to_temp_files(model, with_external_data=False) + else: + _LOGGER.info( + f"Saving the input shapes of the model at {onnx_filepath} " + f"to a temporary file" + ) + return save_onnx_to_temp_files(model, with_external_data=not inplace) def truncate_onnx_model( @@ -356,6 +388,7 @@ def truncate_onnx_model( output.type.tensor_type.shape.Clear() # save and check model + _LOGGER.debug(f"Saving truncated model to {output_filepath}") save_onnx(extracted_model, output_filepath, "external_data") validate_onnx(output_filepath) From a610faf4801fc2e4933711431002de10feeb018e Mon Sep 17 00:00:00 2001 From: Damian Date: Tue, 13 Jun 2023 11:52:53 +0000 Subject: [PATCH 48/68] tested with actual model --- src/deepsparse/pipeline.py | 2 +- src/deepsparse/pipelines/text_generation.py | 542 ------------------ src/deepsparse/transformers/helpers.py | 6 +- .../transformers/pipelines/text_generation.py | 26 +- 4 files changed, 19 insertions(+), 557 deletions(-) delete mode 100644 src/deepsparse/pipelines/text_generation.py diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 3fac78592d..4ffda8800d 100644 --- a/src/deepsparse/pipeline.py +++ 
b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = list(self.executor.map(self.engine_forward, batches)) + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/pipelines/text_generation.py b/src/deepsparse/pipelines/text_generation.py deleted file mode 100644 index a4a510b74f..0000000000 --- a/src/deepsparse/pipelines/text_generation.py +++ /dev/null @@ -1,542 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Dict, List, Optional, Tuple, Type - -import numpy -from onnx import ValueInfoProto -from pydantic import BaseModel, Field -from transformers import AutoConfig, AutoTokenizer - -from deepsparse import Pipeline -from deepsparse.transformers.helpers import ( - get_onnx_path_and_configs, - overwrite_transformer_onnx_model_inputs, -) -from deepsparse.transformers.pipelines import TransformersPipeline -from deepsparse.utils import get_output_names - - -OPT_CACHE_HIDDEN_DIM = 64 - - -__all__ = ["TextGenerationPipeline"] - - -def softmax(x: numpy.ndarray) -> numpy.ndarray: - """ - Compute softmax values for x - :param x: input array - :return: softmax values - """ - return numpy.exp(x) / numpy.sum(numpy.exp(x), axis=0) - - -class TextGenerationInput(BaseModel): - sequence: str = Field( - description="The input sequence to generate the text from.", - ) - - -class TextGenerationOutput(BaseModel): - sequence: str = Field( - description="The generated text sequence.", - ) - - -@Pipeline.register( - task="text_generation", - task_aliases=["codegen", "opt"], -) -class TextGenerationPipeline(TransformersPipeline): - """ - Pipeline for text generation tasks. - - :param deterministic: if True, the pipeline will sample from - the probability distribution computed from the logits. - If False, the pipeline will get the next token by applying - an argmax function to the logits. - :param sampling_temperature: the temperature to use when sampling - from the probability distribution computed from the logits. - Higher values will result in more random samples. - :param max_generated_tokens: the maximum number of tokens to generate - given the input sequence. If None, the model will generate - tokens until the end of the sequence is reached. - Otherwise, it will generate up to the maximum number of tokens or end of - sequence is reached. - :param prompt_batch_threshold: the threshold for the ratio of running the prompt - as a single inference vs running the prompt auto-regressively. - If the number of input sequences divided by the max sequence length is - greater than the threshold, the prompt will be run as a single inference. - Default is None, which will always run auto-regressively. 
- :param force_max_tokens: if True, the pipeline will generate the maximum number - of tokens supplied even if the stop token is reached. - :param kwargs: kwargs to pass to the TransformersPipeline - """ - - def __init__( - self, - deterministic: bool = True, - sampling_temperature: float = 1.0, - max_generated_tokens: Optional[int] = 1024, - prompt_batch_threshold: float = None, - force_max_tokens: bool = False, - **kwargs, - ): - super().__init__(**kwargs, _delay_engine_initialize=True) - - if self._batch_size != 1: - raise ValueError("Only batch size 1 is supported for generation pipelines") - - self.deterministic = deterministic - self.sampling_temperature = sampling_temperature - self.max_generated_tokens = max_generated_tokens - self.prompt_batch_threshold = prompt_batch_threshold - self.force_max_tokens = force_max_tokens - - self.engine = Pipeline.create_engine( - self.onnx_file_path, - self.engine_type, - self.engine_args, - self.context, - ) - - if prompt_batch_threshold is not None and prompt_batch_threshold < 1: - ( - self.onnx_multitoken_path, - self._temp_model_directory, - ) = self._setup_onnx_multitoken_file_path() - self.multitoken_engine = Pipeline.create_engine( - self.onnx_multitoken_path, - self.engine_type, - self.engine_args, - self.context, - ) - else: - self.onnx_multitoken_path = None - self.multitoken_engine = None - - # override tokenizer to pad to left - self.tokenizer.padding_side = "left" - - @staticmethod - def route_input_to_bucket( - *args, input_schema: BaseModel, pipelines: List[Pipeline], **kwargs - ) -> Pipeline: - """ - This method is used to route the input to the correct pipeline. - - :param args: args to pass to the pipeline - :param input_schema: the input schema for the pipeline - :param pipelines: the list of pipelines to route the input to - :param kwargs: kwargs to pass to the pipeline - :return: the pipeline to route the input to - """ - raise ValueError("Bucketing is not supported for generation pipelines") - - @property - def input_schema(self) -> Type[BaseModel]: - """ - Property to return the input schema for the pipeline. - - :return: the input schema for the pipeline - """ - return TextGenerationInput - - @property - def output_schema(self) -> Type[BaseModel]: - """ - Property to return the output schema for the pipeline. - - :return: the output schema for the pipeline - """ - return TextGenerationOutput - - def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: - """ - Convert the input schema for the pipeline to the inputs for the engine. - - :param inputs: the input schema for the pipeline - :return: the inputs for the engine - """ - - self.tokenizer.pad_token = self.tokenizer.eos_token - - input_tokens = self.tokenizer( - inputs.sequence, - return_tensors="np", - max_length=self.sequence_length, - padding="max_length", - ) - - kv_cache = self._initialize_kv_cache(length=0) - - attention_mask = input_tokens["attention_mask"] - - positions = attention_mask.cumsum(1) * attention_mask - positions -= 1 # zero index - TODO: investigate if needed outside OPT - positions_input = dict(positions=positions) - - input_tokens = {**input_tokens, **kv_cache, **positions_input} - engine_input = self.tokens_to_engine_input(input_tokens) - - return engine_input - - def process_engine_outputs( - self, engine_outputs: List[numpy.ndarray], **kwargs - ) -> TextGenerationOutput: - """ - Convert the engine outputs to the output schema for the pipeline. 
- - :param engine_outputs: the outputs from the engine - :return: the output schema for the pipeline - """ - sequence = self.tokenizer.decode(engine_outputs[0][0], skip_special_tokens=True) - return TextGenerationOutput(sequence=sequence) - - def engine_forward( - self, engine_inputs: List[numpy.ndarray], **kwargs - ) -> numpy.ndarray: - """ - Run the forward pass on the engine. - - :param engine_inputs: list of numpy inputs to - Pipeline engine forward pass - :return: A numpy array that contains the tokens generated by the model - """ - # run the prompt through - tokens, kv_cache = self.prompt_inference(engine_inputs) - num_prompt_tokens = len(tokens) - 1 - - # create the generated output - max_tokens = ( - self.max_generated_tokens - if self.max_generated_tokens and self.max_generated_tokens > 0 - else 100 * self.sequence_length - ) # set safety for absolute max generation - generated = [tokens[-1]] - - while len(generated) < max_tokens: - gen_token, kv_cache = self.autoregressive_inference( - tokens, kv_cache, num_prompt_tokens - ) - tokens.append(gen_token) - generated.append(gen_token) - - if gen_token == self.tokenizer.eos_token_id and not self.force_max_tokens: - break - - return numpy.array([[generated]]) - - def prompt_inference( - self, engine_inputs: List[numpy.ndarray] - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: - """ - An inference run that processes the prompt through the - model to generate the new token and populate the kv cache. - - :param engine_inputs: the prompt (context) represented by a - list of numpy inputs to the engine - :return: - - the list of prompt tokens plus the new, generated token - - the kv cache that was populated during the inference - """ - self.timer.start_inference_stage("prompt_inference") - # get tokens by attention mask - tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() - - new_token = None - - if ( - self.prompt_batch_threshold is None - or self.prompt_batch_threshold >= 1 - or len(tokens) / float(self.sequence_length) < self.prompt_batch_threshold - ): - # prompt size is small, run autoregressive inference to populate kv cache - run_tokens = [] - kv_cache = {} - for token in tokens: - run_tokens.append(token) - new_token, kv_cache = self.autoregressive_inference( - run_tokens, kv_cache, num_prompt_tokens=0 - ) - else: - # larger prompt size, run through multi-token engine in single pass - self.timer.start_inference_stage("multitoken_engine") - logits, *cache_values = self.multitoken_engine(engine_inputs, val_inp=False) - self.timer.stop_inference_stage("multitoken_engine") - kv_cache = self.assemble_kv_cache(cache_values, tokens) - new_token = self.generate_token(logits[0, -1]) - - tokens.append(new_token) - self.timer.stop_inference_stage("prompt_inference") - - return tokens, kv_cache - - def autoregressive_inference( - self, - tokens: List[int], - kv_cache: Dict[str, numpy.ndarray], - num_prompt_tokens: int, - ) -> Tuple[int, Dict[str, numpy.ndarray]]: - """ - An inference run that processes the last token and the kv cache to - generate a new token and update the kv cache. 
- - :param tokens: The current context (prompt + generated tokens so far) - :param kv_cache: The key-value cache from the previous inference run - :param num_prompt_tokens: number of tokens in the initial prompt - :return: - - the new, generated token - - the kv cache that was populated during the inference - """ - self.timer.start_inference_stage("autoregressive_inference") - new_token = tokens[-1] - - # padding is added to left, so attention mask is 1s from the - # right up to the number of total tokens (prompt + generated) - attention_mask = numpy.zeros((1, self.sequence_length), dtype=numpy.int64) - num_tokens_processed = min(len(tokens), self.sequence_length) # cap by seq len - attention_mask[:, -num_tokens_processed:] = 1 - # the position of the token is the number of tokens - 1 (zero indexed) - positions = numpy.array([[len(tokens)]], dtype=numpy.int64) - if num_prompt_tokens == 0: - # no prompt tokens, we are currently processing the prompt - positions -= 1 - - engine_inputs = { - "input_ids": numpy.array([[new_token]]), - "attention_mask": attention_mask, - "positions": positions, - } - # initialize the kv cache if it is empty - # (when the prompt is processed with the single-token engine) - kv_cache = ( - self._initialize_kv_cache(length=self.sequence_length - 1) - if kv_cache == {} - else kv_cache - ) - - engine_inputs.update(kv_cache) - engine_inputs = [engine_inputs[name] for name in self.engine.input_names] - - self.timer.start_inference_stage("autoregressive_inference_engine") - new_logits, *cache_values = self.engine(engine_inputs, val_inp=False) - self.timer.stop_inference_stage("autoregressive_inference_engine") - kv_cache = self.assemble_kv_cache(cache_values, tokens) - - # Obtain the next token from the logits - generated_token = self.generate_token(new_logits[0, 0, :]) - self.timer.stop_inference_stage("autoregressive_inference") - - return generated_token, kv_cache - - def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: - """ - Samples a token from the logits using the sampling temperature. - - :param logits: the logits from the model with shape (vocab_size,) - - :return: the sampled token - """ - if self.deterministic: - return numpy.argmax(logits) - - logits /= self.sampling_temperature - - probs = softmax(logits) - - return numpy.random.choice(len(probs), p=probs) - - def setup_onnx_file_path(self) -> str: - """ - Parses ONNX, tokenizer, and config file paths from the given `model_path`. 
- Supports sparsezoo stubs - - :return: file path to the processed ONNX file for the engine to compile - """ - onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( - self.model_path, - require_configs=True, - ) - - self.config = AutoConfig.from_pretrained(config_path) - - self.tokenizer = AutoTokenizer.from_pretrained( - self.model_path, - model_max_length=self.sequence_length, - ) - self.config_path = os.path.join(config_path, "config.json") - - ( - onnx_path, - self.onnx_input_names, - self._temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - onnx_path, - max_length=self.sequence_length, - load_external_data=False, - custom_input_overwrite_func=self.overwrite_onnx_model_inputs, - custom_input_overwrite_func_kwargs=dict( - multitoken=False, - num_attention_heads=self.config.num_attention_heads, - hidden_dims=OPT_CACHE_HIDDEN_DIM, - ), - ) - - return onnx_path - - def _initialize_kv_cache(self, length: int) -> Dict[str, numpy.ndarray]: - # initialize empty kv cache of size - # (num_attention_heads, length, hidden_dims) - empty_kv_cache_tensor = numpy.zeros( - ( - self.config.num_attention_heads, - length, - OPT_CACHE_HIDDEN_DIM, - ), - dtype=numpy.float32, - ) - - cache_keys = [ - output_name.replace("present", "past_key_values") - for output_name in self.engine.output_names - if output_name.startswith("present") - ] - return {key: empty_kv_cache_tensor for key in cache_keys} - - def assemble_kv_cache( - self, - cache_values: List[numpy.ndarray], - tokens: List[int], - consider_sos_token: bool = False, - ) -> Dict[str, numpy.ndarray]: - """ - Restructure the kv cache values from the engine output, so - that it can be passed to the engine in the next inference run. - - KV Cache concatenation adds an extra length dimension to the output - cache, that should be deleted after every inference run. - - There are two modes: - 1. Some values in the cache represent dummy (pad) tokens, padding is - to the left, so the left-most cache value is deleted - 2. The cache is saturated with non-dummy (meaningful) tokens: - - if there is a mandatory start-of-sequence (SOS) token, - we delete the left-most cache value that is not a cache - corresponding to SOS token. - - otherwise we delete from the left as in (1) - - :param cache_values: the cache values from the engine output - :param tokens: the tokens from the previous inference run - :param consider_sos_token: whether to consider the SOS token in the cache - :return kv_cache: the restructured cache values - """ - for idx, cache_value in enumerate(cache_values): - if len(tokens) > self.sequence_length - 1: - idx_to_remove = int(not consider_sos_token) - else: - idx_to_remove = 0 - - # TODO: see if we can do in-place - cache_values[idx] = numpy.delete(cache_value, idx_to_remove, 1) - - cache_keys = [ - name.replace("present", "past_key_values") - for name in self.engine.output_names - if name.startswith("present") - ] - kv_cache = dict(zip(cache_keys, cache_values)) - - return kv_cache - - @staticmethod - def overwrite_onnx_model_inputs( - external_inputs: List[ValueInfoProto], - batch_size: int, - sequence_length: int, - num_attention_heads: int, - hidden_dims: int, - multitoken: bool = True, - ) -> List[str]: - """ - Overwrite the input shape of the onnx model. 
This function - is particular for the model with inputs: - - input_ids - - attention_mask - - positions - - past_key_values (x N) - - :param external_inputs: The external inputs of the onnx model - :param batch_size: The batch size of the input - :param sequence_length: The sequence length of the input - :param num_attention_heads: The number of attention heads - of the model (required to set the shape of the kv_cache) - :param hidden_dims: The hidden dimensions of the model - (required to set the shape of the kv_cache) - :param multitoken: A boolean flag that indicates whether - we are overwriting inputs to the model for multi-token - inference (sequence_len > 1) or single token inference - (sequence_len = 1). - :return: the input names of the onnx model - """ - input_names = [] - for external_input in external_inputs: - if external_input.name in ["input_ids", "positions"]: - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = ( - sequence_length if multitoken else 1 - ) - elif external_input.name == "attention_mask": - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - # regardless of multi-token or not, - # we always provide full attention mask - external_input.type.tensor_type.shape.dim[1].dim_value = sequence_length - elif external_input.name.startswith("past_key_values"): - external_input.type.tensor_type.shape.dim[ - 0 - ].dim_value = num_attention_heads - # empty cache for multi-token runs, - # otherwise max cache len is max len - 1 - external_input.type.tensor_type.shape.dim[1].dim_value = ( - 0 if multitoken else sequence_length - 1 - ) - external_input.type.tensor_type.shape.dim[2].dim_value = hidden_dims - else: - raise ValueError( - f"Unexpected external input name: {external_input.name}" - ) - - input_names.append(external_input.name) - return input_names - - def _setup_onnx_multitoken_file_path(self) -> str: - ( - onnx_path, - _, - _temp_model_directory, - ) = overwrite_transformer_onnx_model_inputs( - self.onnx_file_path, - max_length=self.sequence_length, - load_external_data=False, - custom_input_overwrite_func=self.overwrite_onnx_model_inputs, - custom_input_overwrite_func_kwargs=dict( - multitoken=True, - num_attention_heads=self.config.num_attention_heads, - hidden_dims=OPT_CACHE_HIDDEN_DIM, - ), - ) - - return onnx_path, _temp_model_directory diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 7a8aa6d087..58ca912eeb 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -52,6 +52,7 @@ def get_onnx_path_and_configs( model_path: str, require_configs: bool = False, + model_dir_onnx_name: str = _MODEL_DIR_ONNX_NAME, ) -> Tuple[str, Optional[str], Optional[str]]: """ :param model_path: path to onnx file, transformers sparsezoo stub, @@ -61,6 +62,7 @@ def get_onnx_path_and_configs( :param require_configs: if True, model_path must be a directory containing `model.onnx`, `config.json`, and `tokenizer.json` files. Will raise an exception otherwise + :param model_dir_onnx_name: name of onnx file in model directory :return: tuple of ONNX file path, parent directory of config file if it exists, and parent directory of tokenizer config file if it exists. (Parent directories returned instead of absolute path @@ -79,9 +81,9 @@ def get_onnx_path_and_configs( raise ValueError( f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " f"{model_path}. 
Be sure that an export of the model is written to " - f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" + f"{os.path.join(model_path, model_dir_onnx_name)}" ) - onnx_path = os.path.join(model_path, _MODEL_DIR_ONNX_NAME) + onnx_path = os.path.join(model_path, model_dir_onnx_name) # attempt to read config and tokenizer from sparsezoo-like framework directory framework_dir = None diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 4dc8be0e6d..2a54d30711 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os from typing import Dict, List, Optional, Tuple, Type import numpy @@ -26,9 +25,10 @@ overwrite_transformer_onnx_model_inputs, ) from deepsparse.transformers.pipelines import TransformersPipeline -from deepsparse.utils import get_output_names +_MODEL_DIR_ONNX_KV_CACHE_NAME = "model_kvcache.onnx" + __all__ = ["TextGenerationPipeline"] @@ -111,7 +111,6 @@ def __init__( ) if prompt_batch_threshold is not None and prompt_batch_threshold < 1: - raise ValueError("multitoken engine is currently not supported") ( self.onnx_multitoken_path, self._temp_model_directory, @@ -359,6 +358,7 @@ def setup_onnx_file_path(self) -> str: onnx_path, config_path, tokenizer_path = get_onnx_path_and_configs( self.model_path, require_configs=True, + model_dir_onnx_name=_MODEL_DIR_ONNX_KV_CACHE_NAME, ) self.config = AutoConfig.from_pretrained(config_path) @@ -375,7 +375,6 @@ def setup_onnx_file_path(self) -> str: ) = overwrite_transformer_onnx_model_inputs( onnx_path, max_length=self.sequence_length, - load_external_data=False, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( multitoken=False, @@ -387,13 +386,16 @@ def setup_onnx_file_path(self) -> str: def _initialize_kv_cache(self, length: int) -> Dict[str, numpy.ndarray]: # initialize empty kv cache of size # (batch_size, num_attention_heads, length, hidden_dims) - raise NotImplementedError() + + cache_engine_input_index = next( + i for i, name in enumerate(self.engine.input_names) if "past_key" in name + ) + batch_size, num_attention_heads, _, hidden_dims = self.engine.input_shapes[ + cache_engine_input_index + ] + empty_kv_cache_tensor = numpy.zeros( - ( - self.config.num_attention_heads, - length, - OPT_CACHE_HIDDEN_DIM, - ), + (batch_size, num_attention_heads, length, hidden_dims), dtype=numpy.float32, ) @@ -437,7 +439,7 @@ def assemble_kv_cache( else: idx_to_remove = 0 - cache_values[idx] = numpy.delete(cache_value, idx_to_remove, 1) + cache_values[idx] = numpy.delete(cache_value, idx_to_remove, 2) cache_keys = [ name.replace("present", "past_key_values") @@ -485,6 +487,7 @@ def overwrite_onnx_model_inputs( elif external_input.name.startswith("past_key_values"): # empty cache for multi-token runs, # otherwise max cache len is max len - 1 + external_input.type.tensor_type.shape.dim[2].dim_value = ( 0 if multitoken else sequence_length - 1 ) @@ -503,7 +506,6 @@ def _setup_onnx_multitoken_file_path(self) -> str: ) = overwrite_transformer_onnx_model_inputs( self.onnx_file_path, max_length=self.sequence_length, - load_external_data=False, custom_input_overwrite_func=self.overwrite_onnx_model_inputs, custom_input_overwrite_func_kwargs=dict( multitoken=True, From 347d1fbd3580aeef4520d07ac00e368121e11bc3 Mon Sep 17 00:00:00 
2001 From: Damian Date: Tue, 13 Jun 2023 12:24:26 +0000 Subject: [PATCH 49/68] remove val_inp argument --- .../transformers/pipelines/text_generation.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 2a54d30711..0ca9eebae8 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -55,7 +55,7 @@ class TextGenerationOutput(BaseModel): @Pipeline.register( task="text_generation", - task_aliases=["codegen", "opt"], + task_aliases=["codegen", "opt", "bloom"], ) class TextGenerationPipeline(TransformersPipeline): """ @@ -92,6 +92,12 @@ def __init__( force_max_tokens: bool = False, **kwargs, ): + if kwargs["engine_type"] == "deepsparse": + raise NotImplementedError( + "The text generation pipeline is not " + "supported for the deepsparse engine" + ) + super().__init__(**kwargs, _delay_engine_initialize=True) if self._batch_size != 1: @@ -270,7 +276,7 @@ def prompt_inference( ) else: # larger prompt size, run through multi-token engine in single pass - logits, *cache_values = self.multitoken_engine(engine_inputs, val_inp=False) + logits, *cache_values = self.multitoken_engine(engine_inputs) kv_cache = self.assemble_kv_cache(cache_values, tokens) new_token = self.generate_token(logits[0, -1]) @@ -323,7 +329,7 @@ def autoregressive_inference( engine_inputs.update(kv_cache) engine_inputs = [engine_inputs[name] for name in self.engine.input_names] - new_logits, *cache_values = self.engine(engine_inputs, val_inp=False) + new_logits, *cache_values = self.engine(engine_inputs) kv_cache = self.assemble_kv_cache(cache_values, tokens) # Obtain the next token from the logits From e11027c03426bc42ae1ef71560ce87fbd6184a11 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:30:05 +0200 Subject: [PATCH 50/68] Update README.md --- src/deepsparse/transformers/README.md | 49 +++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/README.md b/src/deepsparse/transformers/README.md index 86a1adbffe..222f5dd1bf 100644 --- a/src/deepsparse/transformers/README.md +++ b/src/deepsparse/transformers/README.md @@ -10,6 +10,7 @@ methods such as [pruning](https://neuralmagic.com/blog/pruning-overview/) and [q These techniques result in significantly more performant and smaller models with limited to no effect on the baseline metrics. This integration currently supports several fundamental NLP tasks: +- **Text Generation** - given the input prompt, generate an output text sequence (e.g. to fill in incomplete text or paraphrase part of the prompt) - **Question Answering** - posing questions about a document - **Sentiment Analysis** - assigning a sentiment to a piece of text - **Text Classification** - assigning a label or class to a piece of text (e.g duplicate question pairing) @@ -30,10 +31,12 @@ compatible with our [hardware requirements](https://docs.neuralmagic.com/deepspa By default, to deploy the transformer using DeepSparse Engine it is required to supply the model in the ONNX format along with the HuggingFace supporting files. This grants the engine the flexibility to serve any model in a framework-agnostic environment. 
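As a side note to the deployment-folder requirements listed below, a small pre-flight check along these lines can catch a missing file before a pipeline is ever built. This is a minimal sketch and not part of deepsparse; the helper name is hypothetical, and the file list simply mirrors the names this README section describes.

```python
# Hypothetical pre-flight check for a local deployment folder (illustrative only).
import os

REQUIRED_FILES = ["model.onnx", "config.json", "tokenizer.json"]


def check_deployment_dir(path: str) -> None:
    """Raise if any file the pipeline expects is missing from `path`."""
    missing = [
        name for name in REQUIRED_FILES
        if not os.path.isfile(os.path.join(path, name))
    ]
    if missing:
        raise FileNotFoundError(f"{path} is missing required files: {missing}")


# check_deployment_dir("/trained_model/")
```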
-The DeepSparse pipelines require the following files within a folder on the local server to properly load a Transformers model: +In general, the DeepSparse pipelines require the following files within a folder on the local server to properly load a Transformers model: - `model.onnx`: The exported Transformers model in the [ONNX format](https://github.com/onnx/onnx). -- `tokenizer.json`: The [HuggingFace compatible tokenizer configuration](https://huggingface.co/docs/transformers/fast_tokenizers) used with the model. +- `model_kvcache.onnx` (optional): the ONNX model with the KV Cache support (akin to the Transformers model with `use_cache = True`. Particular for `text-generation` integration. - `config.json`: The [HuggingFace compatible configuration file](https://huggingface.co/docs/transformers/main_classes/configuration) used with the model. +- `tokenizer_config.json`: The [HuggingFace compatible tokenizer configuration](https://huggingface.co/docs/transformers/fast_tokenizers) used with the model. +- `tokenizer.json`, `special_tokens_map.json`, `vocab.json`, `merges.txt` (optional): Other files that may be required by a tokenizer Below we describe two possibilities to obtain the required structure. @@ -48,7 +51,7 @@ sparseml.transformers.export_onnx --task question-answering --model_path model_p ``` This creates `model.onnx` file, in the directory of your `model_path`(e.g. `/trained_model/model.onnx`). -The `tokenizer.json` and `config.json` are stored under the `model_path` folder as well, so a DeepSparse pipeline ca be directly instantiated by using that folder after export (e.g. `/trained_model/`). +Any additional, required files, such as e.g.`tokenizer.json` or `config.json`, are stored under the `model_path` folder as well, so a DeepSparse pipeline ca be directly instantiated by using that folder after export (e.g. `/trained_model/`). #### SparseZoo Stub Alternatively, you can skip the process of the ONNX model export by using Neural Magic's [SparseZoo](https://sparsezoo.neuralmagic.com/). The SparseZoo contains pre-sparsified models and SparseZoo stubs enable you to reference any model on the SparseZoo in a convenient and predictable way. @@ -137,6 +140,46 @@ response.text >> '{"score":0.9534820914268494,"start":8,"end":14,"answer":"batman"}' ``` +### Text Generation +The text generation task generates a sequence of words given the prompt. Popular text generation LLMs (Large Language Models) are used +for the chats (the instruction models), code generation, text summarization or filling out the missing text. +are used for chats or following instructions are also covered in this task. The following example uses a sparsified text classification +OPT model to complete the prompt + +[List of available SparseZoo Text Generation Models]( +https://sparsezoo.neuralmagic.com/?useCase=text_generation) + +#### Python Pipeline +```python +from deepsparse import Pipeline + +opt_pipeline = Pipeline.create(task="opt") + +inference = opt_pipeline("Who is the president of the United States?") + +>> 'The president of the United States is the head of the executive branch of government...' 
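# Note: the snippet above is a sketch; if `Pipeline.create(task="opt")` cannot resolve a
# default model, a `model_path` pointing at a local deployment folder (or a SparseZoo
# stub, once text generation models are published) would presumably need to be passed
# explicitly, e.g. Pipeline.create(task="opt", model_path="/path/to/deployment").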
+``` + +#### HTTP Server +Spinning up: +```bash +deepsparse.server \ + task sentiment-analysis \ + --model_path # TODO: Pending until text generation models get uploaded to SparseZoo +``` + +Making a request: +```python +import requests + +url = "http://localhost:5543/predict" # Server's port default to 5543 + +obj = {"sequence": "Who is the president of the United States?"} + +response = requests.post(url, json=obj) +response.text + +>> 'The president of the United States is the head of the executive branch of government...' ### Sentiment Analysis The sentiment analysis task takes in a sentence and classifies its sentiment. The following example From a95091096928102b739b31aefe95f02b585ac569 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:38:48 +0200 Subject: [PATCH 51/68] Apply suggestions from code review --- src/deepsparse/pipeline.py | 2 +- src/deepsparse/transformers/README.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 4ffda8800d..3fac78592d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/transformers/README.md b/src/deepsparse/transformers/README.md index 222f5dd1bf..280a67cf7a 100644 --- a/src/deepsparse/transformers/README.md +++ b/src/deepsparse/transformers/README.md @@ -33,7 +33,7 @@ This grants the engine the flexibility to serve any model in a framework-agnosti In general, the DeepSparse pipelines require the following files within a folder on the local server to properly load a Transformers model: - `model.onnx`: The exported Transformers model in the [ONNX format](https://github.com/onnx/onnx). -- `model_kvcache.onnx` (optional): the ONNX model with the KV Cache support (akin to the Transformers model with `use_cache = True`. Particular for `text-generation` integration. +- `model_kvcache.onnx` (optional): the ONNX model with the KV Cache support (akin to the Transformers model with `use_cache = True`. Specific for the `text-generation` integration. - `config.json`: The [HuggingFace compatible configuration file](https://huggingface.co/docs/transformers/main_classes/configuration) used with the model. - `tokenizer_config.json`: The [HuggingFace compatible tokenizer configuration](https://huggingface.co/docs/transformers/fast_tokenizers) used with the model. - `tokenizer.json`, `special_tokens_map.json`, `vocab.json`, `merges.txt` (optional): Other files that may be required by a tokenizer @@ -142,7 +142,7 @@ response.text ``` ### Text Generation The text generation task generates a sequence of words given the prompt. Popular text generation LLMs (Large Language Models) are used -for the chats (the instruction models), code generation, text summarization or filling out the missing text. +for the chats (the instruction models), code generation, text summarization, or filling out the missing text. are used for chats or following instructions are also covered in this task. 
The following example uses a sparsified text classification OPT model to complete the prompt @@ -164,7 +164,7 @@ inference = opt_pipeline("Who is the president of the United States?") Spinning up: ```bash deepsparse.server \ - task sentiment-analysis \ + task text-generation \ --model_path # TODO: Pending until text generation models get uploaded to SparseZoo ``` From c1d02dc24c54945979317b33eaadcad1629ce02e Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Tue, 13 Jun 2023 15:41:37 +0200 Subject: [PATCH 52/68] Update README.md --- src/deepsparse/transformers/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/src/deepsparse/transformers/README.md b/src/deepsparse/transformers/README.md index 280a67cf7a..185f349b16 100644 --- a/src/deepsparse/transformers/README.md +++ b/src/deepsparse/transformers/README.md @@ -180,6 +180,7 @@ response = requests.post(url, json=obj) response.text >> 'The president of the United States is the head of the executive branch of government...' +``` ### Sentiment Analysis The sentiment analysis task takes in a sentence and classifies its sentiment. The following example From 2085c37247dc5c78e6f3bca4fb3dddf575d0804e Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Wed, 14 Jun 2023 16:35:00 -0400 Subject: [PATCH 53/68] [BugFix] Update deepsparse dockerfile (#1069) * Remove autoinstall triggering commands * Fix typo --- docker/Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0b923db808..6551237e30 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -81,9 +81,6 @@ RUN \ $VENV/bin/pip install --no-cache-dir "deepsparse[server,yolo,onnxruntime,yolov8,transformers,image_classification]==$VERSION"; \ fi; -RUN deepsparse.transformers.run_inference --help \ - && deepsparse.image_classification.annotate --help - FROM base AS container_branch_dev ARG VENV @@ -97,7 +94,7 @@ RUN \ $VENV/bin/pip install -e "./deepsparse[dev]"; \ else \ echo Installing from main with editable mode && \ - git clone https://github.com/neuralmagic/sparseml.git --depth 1 -b main && \ + git clone https://github.com/neuralmagic/deepsparse.git --depth 1 -b main && \ $VENV/bin/pip install -e "./deepsparse[dev]"; \ fi; From 2f7bc9533418bb0dc653c1c7e53f7122ef22b7c4 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 15 Jun 2023 17:52:06 +0000 Subject: [PATCH 54/68] initial implementation --- .../transformers/eval_text_generation.py | 82 +++++++++++ src/deepsparse/transformers/metrics.py | 135 ++++++++++++++++++ 2 files changed, 217 insertions(+) create mode 100644 src/deepsparse/transformers/eval_text_generation.py diff --git a/src/deepsparse/transformers/eval_text_generation.py b/src/deepsparse/transformers/eval_text_generation.py new file mode 100644 index 0000000000..e02b70b237 --- /dev/null +++ b/src/deepsparse/transformers/eval_text_generation.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from typing import List + +import numpy as np +import onnxruntime +from transformers import AutoConfig, AutoTokenizer + +from deepsparse.transformers.metrics import Perplexity +from evaluate import load + + +input_text1 = "While the Indiana Jones movies likely inspired a lot of young viewers to become interested in a career in archeology, they are not very realistic depictions of the profession. The movies seem very willing to admit this with some of the more poignant Indiana Jones quotes. " +input_text2 = "During an early scene in Indiana Jones and the Last Crusade, Indy lectures his students that a big part of being an archeologist is research. Of course, he then goes off on a thrilling adventure of treasure hunting where X does, in fact, mark the spot." + + +def perplexity_eval(args: argparse.Namespace): + # not using the pipeline for now, since it does not support batched inputs + session = onnxruntime.InferenceSession(os.path.join(args.model_path, "model.onnx")) + tokenizer = AutoTokenizer.from_pretrained(args.model_path) + config = AutoConfig.from_pretrained(args.model_path) + + perplexity = Perplexity( + session=session, + tokenizer=tokenizer, + vocab_size=config.vocab_size, + static_length=args.sequence_length, + ) + for input_text in args.dataset: + perplexity.add_batch(input_text) + + return perplexity.compute() + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + description="Evaluate a text-generation model on a toy dataset." + ) + parser.add_argument( + "--model_path", + type=str, + default="/home/ubuntu/damian/sparseml/deployment", + help="Path to the model directory", + ) + parser.add_argument( + "--dataset", + type=List[str], + default=[input_text1, input_text2], + help="A list of strings to evaluate perplexity on", + ) + parser.add_argument( + "--sequence_length", + type=int, + default=128, + help="Sequence length to use for evaluation", + ) + args = parser.parse_args() + results = perplexity_eval(args) + + # testing the correctness + perplexity = load("perplexity", module_type="metric") + gt_results = perplexity.compute( + predictions=[input_text1, input_text2], model_id="facebook/opt-350M" + ) + assert np.allclose( + np.array(results["perplexities"]), np.array(gt_results["perplexities"]) + ) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 407e9b9d6b..7f10dc3240 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -20,15 +20,150 @@ from typing import Dict, Optional import numpy +from onnxruntime import InferenceSession +from tqdm import tqdm +from transformers import PreTrainedTokenizer +import torch from sklearn.metrics import precision_recall_fscore_support __all__ = [ "PrecisionRecallF1", + "Perplexity", ] +class Perplexity: + def __init__( + self, + session: InferenceSession, + tokenizer: PreTrainedTokenizer, + vocab_size: int, + static_length: Optional[int] = None, + ): + """ + Given the onnxruntime session, compute the perplexity of the model + on the given text input. + Session will be in future swapped for the text generation pipeline. 
+ + :param session: The onnxruntime session to use for inference + :param tokenizer: The tokenizer to use for tokenizing the input text + :param vocab_size: The size of the vocabulary for the model + :param static_length: The static length of the input text to use + for computing logits + """ + + self._session = session + self._tokenizer = tokenizer + self._vocab_size = vocab_size + self._static_length = static_length + self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") + + self.encoded_batches = None # (batch_size, self._static_length) + self.attention_masks = None # (batch_size, self._static_length) + + def add_batch( + self, input_text: str, batch_size: int = 16, add_start_token: bool = True + ): + """ + Converts input_text into data that can be eventually used to compute perplexity. + + :param input_text: The text to convert into data for computing perplexity + :param batch_size: The batch size to use for tokenization + :param add_start_token: Whether to add the start token to the input text + """ + + if add_start_token and self._static_length: + # leave room for token to be added: + assert self._tokenizer.bos_token is not None, ( + "Input model must already have a BOS token " + "if using add_start_token=True. Please use a " + "different model, or set add_start_token=False" + ) + max_tokenized_len = self._static_length - 1 + else: + max_tokenized_len = self._static_length + + encodings = self._tokenizer( + input_text, + add_special_tokens=False, + padding="max_length", + max_length=max_tokenized_len, + return_tensors="np", + return_attention_mask=True, + ) + + encoded_texts = encodings["input_ids"] + attention_mask = encodings["attention_mask"] + + for start_index in tqdm(range(0, len(encoded_texts), batch_size)): + end_index = min(start_index + batch_size, len(encoded_texts)) + encoded_batch = encoded_texts[start_index:end_index] + attention_mask = attention_mask[start_index:end_index] + + if add_start_token: + batch_size = encoded_batch.shape[0] + # make tensor same shape as encoded_batch, but with token + bos_tokens = numpy.array([[self._tokenizer.bos_token_id]] * batch_size) + encoded_batch = numpy.concatenate([bos_tokens, encoded_batch], axis=1) + attention_mask = numpy.concatenate( + [numpy.ones(bos_tokens.shape, dtype=numpy.int64), attention_mask], + axis=1, + ) + self.encoded_batches = ( + encoded_batch + if self.encoded_batches is None + else numpy.concatenate([self.encoded_batches, encoded_batch], axis=0) + ) + self.attention_masks = ( + attention_mask + if self.attention_masks is None + else numpy.concatenate([self.attention_masks, attention_mask], axis=0) + ) + + def compute(self) -> Dict[str, float]: + """ + Given the data collected by add_batch() method, + compute the perplexity of the model + """ + perplexities = [] + logits_batch = self._session.run( + ["logits"], + dict(input_ids=self.encoded_batches, attention_mask=self.attention_masks), + )[0] + + for idx, (logits, labels, attention_mask) in enumerate( + zip(logits_batch, self.encoded_batches, self.attention_masks) + ): + # remove padding tokens + logits = logits[: attention_mask.sum(), :] + labels = labels[: attention_mask.sum()] + attn_mask = attention_mask[: attention_mask.sum()] + + # shift logits and labels create the input and target for the loss function + shift_logits = logits[:-1, :] + shift_labels = labels[1:] + shift_attention_mask_batch = attn_mask[1:] + + # compute perplexity for this batch + perplexity_batch = torch.exp( + ( + self._loss_fct( + torch.tensor(shift_logits), 
torch.tensor(shift_labels) + ) + * torch.tensor(shift_attention_mask_batch) + ).sum() + / torch.tensor(shift_attention_mask_batch).sum() + ) + + perplexities.append(perplexity_batch.item()) + return { + "perplexities": perplexities, + "mean_perplexity": numpy.mean(perplexities), + } + + class PrecisionRecallF1: def __init__(self, id_to_label: Optional[Dict[int, str]] = None): self._id_to_label = id_to_label From e18fab703c7422e241784605fdc0151038a697a8 Mon Sep 17 00:00:00 2001 From: Damian Date: Fri, 16 Jun 2023 09:57:46 +0000 Subject: [PATCH 55/68] working implementation for pipeline input --- src/deepsparse/pipeline.py | 2 +- .../transformers/eval_text_generation.py | 78 ++++++++-------- src/deepsparse/transformers/metrics.py | 90 ++++++++++++------- .../transformers/pipelines/text_generation.py | 57 ++++++++---- 4 files changed, 138 insertions(+), 89 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 3fac78592d..4ffda8800d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = list(self.executor.map(self.engine_forward, batches)) + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/transformers/eval_text_generation.py b/src/deepsparse/transformers/eval_text_generation.py index e02b70b237..a1d169da58 100644 --- a/src/deepsparse/transformers/eval_text_generation.py +++ b/src/deepsparse/transformers/eval_text_generation.py @@ -12,36 +12,33 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Example use: +python src/deepsparse/transformers/eval_text_generation.py +--task opt +--model_path /home/ubuntu/damian/sparseml/deployment +--sentence "While the Indiana Jones movies likely inspired a lot of young viewers" +--sequence_length 128 +""" # noqa: W291 import argparse -import os -from typing import List - -import numpy as np -import onnxruntime -from transformers import AutoConfig, AutoTokenizer +from typing import Any, Dict +from deepsparse import Pipeline from deepsparse.transformers.metrics import Perplexity -from evaluate import load - -input_text1 = "While the Indiana Jones movies likely inspired a lot of young viewers to become interested in a career in archeology, they are not very realistic depictions of the profession. The movies seem very willing to admit this with some of the more poignant Indiana Jones quotes. " -input_text2 = "During an early scene in Indiana Jones and the Last Crusade, Indy lectures his students that a big part of being an archeologist is research. Of course, he then goes off on a thrilling adventure of treasure hunting where X does, in fact, mark the spot." 
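For reference, the perplexity computed in `Perplexity.compute()` above boils down to exponentiating the masked mean token-level cross-entropy of the shifted logits against the shifted labels. Below is a minimal self-contained sketch of that formula using torch only, with toy shapes standing in for the model's actual outputs.

```python
import torch


def sequence_perplexity(
    logits: torch.Tensor, labels: torch.Tensor, mask: torch.Tensor
) -> float:
    """Perplexity of one sequence: exp of the masked mean cross-entropy.

    logits: (seq_len, vocab_size); labels and mask: (seq_len,)
    """
    # predict token i+1 from position i, so drop the last logit and the first label
    shift_logits = logits[:-1]
    shift_labels = labels[1:]
    shift_mask = mask[1:].float()

    loss = torch.nn.functional.cross_entropy(
        shift_logits, shift_labels, reduction="none"
    )
    return torch.exp((loss * shift_mask).sum() / shift_mask.sum()).item()


# toy example: 5 tokens over a 10-word vocabulary
ppl = sequence_perplexity(
    torch.randn(5, 10), torch.randint(0, 10, (5,)), torch.ones(5)
)
```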
+def perplexity_eval(args: argparse.Namespace) -> Dict[str, Any]: -def perplexity_eval(args: argparse.Namespace): - # not using the pipeline for now, since it does not support batched inputs - session = onnxruntime.InferenceSession(os.path.join(args.model_path, "model.onnx")) - tokenizer = AutoTokenizer.from_pretrained(args.model_path) - config = AutoConfig.from_pretrained(args.model_path) - - perplexity = Perplexity( - session=session, - tokenizer=tokenizer, - vocab_size=config.vocab_size, - static_length=args.sequence_length, + pipeline = Pipeline.create( + task=args.task, + model_path=args.model_path, + sequence_length=args.sequence_length, + engine_type=args.engine_type, + max_generated_tokens=1, ) - for input_text in args.dataset: - perplexity.add_batch(input_text) + + perplexity = Perplexity(pipeline=pipeline) + perplexity.add_batch(args.sentence) return perplexity.compute() @@ -49,34 +46,35 @@ def perplexity_eval(args: argparse.Namespace): if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Evaluate a text-generation model on a toy dataset." + description="Evaluate perplexity of a text-generation model on a dataset." ) parser.add_argument( "--model_path", type=str, - default="/home/ubuntu/damian/sparseml/deployment", help="Path to the model directory", ) parser.add_argument( - "--dataset", - type=List[str], - default=[input_text1, input_text2], - help="A list of strings to evaluate perplexity on", + "--sentence", + type=str, + help="A sentence to evaluate perplexity on", ) parser.add_argument( "--sequence_length", type=int, - default=128, - help="Sequence length to use for evaluation", + help="Sequence length of the pipeline to use for evaluation", ) - args = parser.parse_args() - results = perplexity_eval(args) - - # testing the correctness - perplexity = load("perplexity", module_type="metric") - gt_results = perplexity.compute( - predictions=[input_text1, input_text2], model_id="facebook/opt-350M" + parser.add_argument( + "--task", + type=str, + help="Task to use for evaluation", ) - assert np.allclose( - np.array(results["perplexities"]), np.array(gt_results["perplexities"]) + parser.add_argument( + "--engine_type", + type=str, + default="onnxruntime", + help="Engine type to use for evaluation", ) + + args = parser.parse_args() + results = perplexity_eval(args) + print(results) diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 7f10dc3240..91d8684fd1 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -17,14 +17,15 @@ """ -from typing import Dict, Optional +from typing import Any, Dict, Optional import numpy -from onnxruntime import InferenceSession from tqdm import tqdm -from transformers import PreTrainedTokenizer +from transformers import AutoTokenizer import torch +from deepsparse import Pipeline +from deepsparse.transformers.pipelines.text_generation import TextGenerationPipeline from sklearn.metrics import precision_recall_fscore_support @@ -37,27 +38,25 @@ class Perplexity: def __init__( self, - session: InferenceSession, - tokenizer: PreTrainedTokenizer, - vocab_size: int, - static_length: Optional[int] = None, + pipeline: Pipeline, ): """ - Given the onnxruntime session, compute the perplexity of the model + Given the pipeline, compute the perplexity of the model on the given text input. - Session will be in future swapped for the text generation pipeline. 
- :param session: The onnxruntime session to use for inference - :param tokenizer: The tokenizer to use for tokenizing the input text - :param vocab_size: The size of the vocabulary for the model - :param static_length: The static length of the input text to use - for computing logits - """ + Code adapted from: + https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501 - self._session = session - self._tokenizer = tokenizer - self._vocab_size = vocab_size - self._static_length = static_length + :param pipeline: The pipeline to use for text generation + """ + if not isinstance(pipeline, TextGenerationPipeline): + raise ValueError( + "Perplexity can only be computed for text generation pipelines" + ) + self._pipeline = pipeline + self._tokenizer = AutoTokenizer.from_pretrained(pipeline.model_path) + self._vocab_size = pipeline.config.vocab_size + self._static_length = pipeline.sequence_length self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") self.encoded_batches = None # (batch_size, self._static_length) @@ -68,9 +67,12 @@ def add_batch( ): """ Converts input_text into data that can be eventually used to compute perplexity. + Note: BOS token means "Begging of Sentence" token, which as + the same as SOS token "Start of Sentence" token. :param input_text: The text to convert into data for computing perplexity - :param batch_size: The batch size to use for tokenization + :param batch_size: The batch size to split the input text into + non-overlapping batches :param add_start_token: Whether to add the start token to the input text """ @@ -97,6 +99,7 @@ def add_batch( encoded_texts = encodings["input_ids"] attention_mask = encodings["attention_mask"] + # split input_text into non-overlapping batches of `batch_size` for start_index in tqdm(range(0, len(encoded_texts), batch_size)): end_index = min(start_index + batch_size, len(encoded_texts)) encoded_batch = encoded_texts[start_index:end_index] @@ -111,6 +114,7 @@ def add_batch( [numpy.ones(bos_tokens.shape, dtype=numpy.int64), attention_mask], axis=1, ) + # save batches in the class object's state self.encoded_batches = ( encoded_batch if self.encoded_batches is None @@ -122,29 +126,49 @@ def add_batch( else numpy.concatenate([self.attention_masks, attention_mask], axis=0) ) - def compute(self) -> Dict[str, float]: + def compute(self) -> Dict[str, Any]: """ Given the data collected by add_batch() method, compute the perplexity of the model """ perplexities = [] - logits_batch = self._session.run( - ["logits"], - dict(input_ids=self.encoded_batches, attention_mask=self.attention_masks), - )[0] - for idx, (logits, labels, attention_mask) in enumerate( - zip(logits_batch, self.encoded_batches, self.attention_masks) + """ + Because we are not able to run batched inference + on the pipeline, we need to run inference on each + sequence in the batch individually. 
+ In the future, once the batch support is ready, + we could simply run in the pipeline + ``` + out = self._pipeline(sequence=func(self.encoded_batches)) + ``` + """ + for idx, (encoded_batch, attention_mask) in enumerate( + zip(self.encoded_batches, self.attention_masks) ): - # remove padding tokens - logits = logits[: attention_mask.sum(), :] - labels = labels[: attention_mask.sum()] - attn_mask = attention_mask[: attention_mask.sum()] + batch_logits = [] + tokens = encoded_batch.tolist()[: attention_mask.sum()] + batch_sequences = [ + self._tokenizer.decode(tokens[:i], skip_special_tokens=True) + for i in range(1, len(tokens) + 1) + ] + for sequence in batch_sequences: + # cannot do it in batch, we need to run + # p(x_i | x_1, ..., x_{i-1}) for each i + out = self._pipeline(sequence=sequence, return_logits=True) + batch_logits.append(out.logits) + + logits = numpy.concatenate(batch_logits, axis=1)[0] + + # extract only the meaningful info from the + # data that assumes static length + labels = encoded_batch[: attention_mask.sum()] + attention_mask = attention_mask[: attention_mask.sum()] # shift logits and labels create the input and target for the loss function - shift_logits = logits[:-1, :] + shift_logits = logits[:-1] shift_labels = labels[1:] - shift_attention_mask_batch = attn_mask[1:] + shift_attention_mask_batch = attention_mask[1:] # compute perplexity for this batch perplexity_batch = torch.exp( diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 0ca9eebae8..c4bd1b69ee 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -45,12 +45,26 @@ class TextGenerationInput(BaseModel): sequence: str = Field( description="The input sequence to generate the text from.", ) + return_logits: bool = Field( + default=False, + description="A flag that indicates whether to return " + "the logits for the generated text sequence. ", + ) class TextGenerationOutput(BaseModel): sequence: str = Field( description="The generated text sequence.", ) + logits: Optional[numpy.ndarray] = Field( + default=None, + description="The logits for the generated text sequence." 
+ "The logits have dimensions " + "[batch_size, sequence_length, vocab_size]", + ) + + class Config: + arbitrary_types_allowed = True @Pipeline.register( @@ -195,7 +209,9 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: input_tokens = {**input_tokens, **kv_cache, **positions_input} engine_input = self.tokens_to_engine_input(input_tokens) - return engine_input + postprocess_kwargs = dict(return_logits=inputs.return_logits) + + return engine_input, postprocess_kwargs def process_engine_outputs( self, engine_outputs: List[numpy.ndarray], **kwargs @@ -206,21 +222,27 @@ def process_engine_outputs( :param engine_outputs: the outputs from the engine :return: the output schema for the pipeline """ - sequence = self.tokenizer.decode(engine_outputs[0][0], skip_special_tokens=True) - return TextGenerationOutput(sequence=sequence) + generated_tokens, generated_logits = engine_outputs + sequence = self.tokenizer.decode( + generated_tokens[0][0], skip_special_tokens=True + ) + logits = generated_logits if kwargs.get("return_logits") else None + + return TextGenerationOutput(sequence=sequence, logits=logits) def engine_forward( self, engine_inputs: List[numpy.ndarray], **kwargs - ) -> numpy.ndarray: + ) -> Tuple[numpy.ndarray, numpy.ndarray]: """ Run the forward pass on the engine. :param engine_inputs: list of numpy inputs to Pipeline engine forward pass :return: A numpy array that contains the tokens generated by the model + and a numpy array that contains the logits generated by the model """ # run the prompt through - tokens, kv_cache = self.prompt_inference(engine_inputs) + tokens, new_logit, kv_cache = self.prompt_inference(engine_inputs) num_prompt_tokens = len(tokens) - 1 # create the generated output @@ -229,23 +251,26 @@ def engine_forward( if self.max_generated_tokens and self.max_generated_tokens > 0 else 100 * self.sequence_length ) # set safety for absolute max generation + generated = [tokens[-1]] + logits = [new_logit] while len(generated) < max_tokens: - gen_token, kv_cache = self.autoregressive_inference( + gen_token, gen_logit, kv_cache = self.autoregressive_inference( tokens, kv_cache, num_prompt_tokens ) tokens.append(gen_token) generated.append(gen_token) + logits.append(gen_logit) if gen_token == self.tokenizer.eos_token_id and not self.force_max_tokens: break - return numpy.array([[generated]]) + return numpy.array([[generated]]), numpy.concatenate(logits, axis=1) def prompt_inference( self, engine_inputs: List[numpy.ndarray] - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + ) -> Tuple[List[int], numpy.ndarray, Dict[str, numpy.ndarray]]: """ An inference run that processes the prompt through the model to generate the new token and populate the kv cache. 
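To make the control flow of the updated `engine_forward` easier to follow, here is a stripped-down sketch of the same generate-and-collect pattern. The `step` callable is only a stand-in for the engine call (it is not a deepsparse API), and a greedy `argmax` stands in for `generate_token`; it assumes at least one new token is requested.

```python
import numpy


def greedy_generate(step, prompt_tokens, max_new_tokens, eos_token_id):
    """Toy loop: generate tokens one at a time and keep the per-step logits."""
    tokens = list(prompt_tokens)
    collected_logits = []
    for _ in range(max_new_tokens):
        logits = step(tokens)  # stand-in: returns a (vocab_size,) array for the last position
        next_token = int(numpy.argmax(logits))  # greedy choice, as in the deterministic path
        tokens.append(next_token)
        collected_logits.append(logits)
        if next_token == eos_token_id:
            break
    return tokens, numpy.stack(collected_logits)
```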
@@ -254,6 +279,7 @@ def prompt_inference( list of numpy inputs to the engine :return: - the list of prompt tokens plus the new, generated token + - the logits for the new token - the kv cache that was populated during the inference """ # get tokens by attention mask @@ -271,25 +297,26 @@ def prompt_inference( kv_cache = {} for token in tokens: run_tokens.append(token) - new_token, kv_cache = self.autoregressive_inference( + new_token, new_logits, kv_cache = self.autoregressive_inference( run_tokens, kv_cache, num_prompt_tokens=0 ) else: # larger prompt size, run through multi-token engine in single pass - logits, *cache_values = self.multitoken_engine(engine_inputs) + logits_sequence, *cache_values = self.multitoken_engine(engine_inputs) kv_cache = self.assemble_kv_cache(cache_values, tokens) - new_token = self.generate_token(logits[0, -1]) + new_logits = logits_sequence[0, -1] + new_token = self.generate_token(new_logits) tokens.append(new_token) - return tokens, kv_cache + return tokens, new_logits, kv_cache def autoregressive_inference( self, tokens: List[int], kv_cache: Dict[str, numpy.ndarray], num_prompt_tokens: int, - ) -> Tuple[int, Dict[str, numpy.ndarray]]: + ) -> Tuple[int, numpy.ndarray, Dict[str, numpy.ndarray]]: """ An inference run that processes the last token and the kv cache to generate a new token and update the kv cache. @@ -299,6 +326,7 @@ def autoregressive_inference( :param num_prompt_tokens: number of tokens in the initial prompt :return: - the new, generated token + - the logits for the new token - the kv cache that was populated during the inference """ new_token = tokens[-1] @@ -334,8 +362,7 @@ def autoregressive_inference( # Obtain the next token from the logits generated_token = self.generate_token(new_logits[0, 0, :]) - - return generated_token, kv_cache + return generated_token, new_logits, kv_cache def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: """ From 0358d878a13c616f8ba424f54471d230f6d8c50e Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 15 Jun 2023 16:35:24 +0200 Subject: [PATCH 56/68] [Fix] Fix CLI benchmark errors (#1071) * initial commit * ready for review * Update src/deepsparse/utils/onnx.py --- src/deepsparse/utils/onnx.py | 28 +++++++----- tests/deepsparse/utils/__init__.py | 13 ++++++ tests/deepsparse/utils/onnx.py | 71 ++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 tests/deepsparse/utils/__init__.py create mode 100644 tests/deepsparse/utils/onnx.py diff --git a/src/deepsparse/utils/onnx.py b/src/deepsparse/utils/onnx.py index 5ce98ce6a0..a73cddc24b 100644 --- a/src/deepsparse/utils/onnx.py +++ b/src/deepsparse/utils/onnx.py @@ -60,15 +60,16 @@ def save_onnx_to_temp_files(model: onnx.ModelProto, with_external_data=False) -> :param model: The onnx model to save to temporary directory :param with_external_data: Whether to save external data to a separate file """ - shaped_model = tempfile.NamedTemporaryFile(mode="w", delete=False) - _LOGGER.info(f"Saving model to temporary directory: {tempfile.tempdir}") + + shaped_model = tempfile.NamedTemporaryFile(suffix=".onnx", delete=False, mode="w") + _LOGGER.info(f"Saving model to temporary file: {shaped_model.name}") if with_external_data: - external_data = os.path.join( - tempfile.tempdir, next(tempfile._get_candidate_names()) + external_data = tempfile.NamedTemporaryFile( + suffix=".data", delete=False, mode="w" ) - has_external_data = save_onnx(model, shaped_model.name, 
external_data) - _LOGGER.info(f"Saving external data to temporary directory: {external_data}") + _LOGGER.info(f"Saving external data to temporary file: {external_data.name}") + has_external_data = save_onnx(model, shaped_model.name, external_data.name) else: has_external_data = save_onnx(model, shaped_model.name) try: @@ -236,9 +237,13 @@ def override_onnx_batch_size( save_onnx(model, onnx_filepath) yield onnx_filepath else: - return save_onnx_to_temp_files(model, with_external_data=not inplace) + with save_onnx_to_temp_files( + model, with_external_data=not inplace + ) as temp_file: + yield temp_file +@contextlib.contextmanager def override_onnx_input_shapes( onnx_filepath: str, input_shapes: Union[List[int], List[List[int]]], @@ -300,16 +305,19 @@ def override_onnx_input_shapes( if inplace: _LOGGER.info( - "Overwriting in-place the input shapes of the model " f"at {onnx_filepath}" + f"Overwriting in-place the input shapes of the model at {onnx_filepath}" ) onnx.save(model, onnx_filepath) - return onnx_filepath + yield onnx_filepath else: _LOGGER.info( f"Saving the input shapes of the model at {onnx_filepath} " f"to a temporary file" ) - return save_onnx_to_temp_files(model, with_external_data=not inplace) + with save_onnx_to_temp_files( + model, with_external_data=not inplace + ) as temp_file: + yield temp_file def truncate_onnx_model( diff --git a/tests/deepsparse/utils/__init__.py b/tests/deepsparse/utils/__init__.py new file mode 100644 index 0000000000..0c44f887a4 --- /dev/null +++ b/tests/deepsparse/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/deepsparse/utils/onnx.py b/tests/deepsparse/utils/onnx.py new file mode 100644 index 0000000000..34b95c9a47 --- /dev/null +++ b/tests/deepsparse/utils/onnx.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
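Because `override_onnx_batch_size` and `override_onnx_input_shapes` now yield their result instead of returning it, call sites have to move to a `with` block; the tests that follow do exactly that. A minimal usage sketch, with a placeholder model path:

```python
import onnx
from deepsparse.utils.onnx import override_onnx_input_shapes

# placeholder path to an ONNX file on disk
onnx_filepath = "/path/to/model.onnx"

# inplace=False: the reshaped graph is written to a temporary copy, leaving the original untouched
with override_onnx_input_shapes(
    onnx_filepath, [10, 224, 224, 3], inplace=False
) as modified_model_path:
    reshaped = onnx.load(modified_model_path)
    print(
        [dim.dim_value for dim in reshaped.graph.input[0].type.tensor_type.shape.dim]
    )
```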
+ + +import onnx + +import pytest +from deepsparse.utils.onnx import override_onnx_batch_size, override_onnx_input_shapes +from sparsezoo import Model + + +@pytest.mark.parametrize( + "test_model, batch_size", + [ + ( + "zoo:cv/classification/mobilenet_v1-1.0/pytorch/sparseml/imagenet/base-none", # noqa: E501 + 10, + ) + ], + scope="function", +) +@pytest.mark.parametrize("inplace", [True, False], scope="function") +def test_override_onnx_batch_size(test_model, batch_size, inplace): + onnx_file_path = Model(test_model).onnx_model.path + # Override the batch size of the ONNX model + with override_onnx_batch_size( + onnx_file_path, batch_size, inplace=inplace + ) as modified_model_path: + # Load the modified ONNX model + modified_model = onnx.load(modified_model_path) + assert ( + modified_model.graph.input[0].type.tensor_type.shape.dim[0].dim_value + == batch_size + ) + + +@pytest.mark.parametrize( + "test_model, input_shapes", + [ + ( + "zoo:cv/classification/mobilenet_v1-1.0/pytorch/sparseml/imagenet/base-none", # noqa: E501 + [10, 224, 224, 3], + ) + ], + scope="function", +) +@pytest.mark.parametrize("inplace", [True, False], scope="function") +def test_override_onnx_input_shapes(test_model, input_shapes, inplace): + onnx_file_path = Model(test_model).onnx_model.path + # Override the batch size of the ONNX model + with override_onnx_input_shapes( + onnx_file_path, input_shapes, inplace=inplace + ) as modified_model_path: + # Load the modified ONNX model + modified_model = onnx.load(modified_model_path) + new_input_shapes = [ + dim.dim_value + for dim in modified_model.graph.input[0].type.tensor_type.shape.dim + ] + assert new_input_shapes == input_shapes From 63b116bf4d6d1c2d7366ffb8c30ffd92d452109a Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Fri, 16 Jun 2023 12:01:43 +0200 Subject: [PATCH 57/68] Clean a typo in the pipeline code --- src/deepsparse/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 4ffda8800d..3fac78592d 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -263,7 +263,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) From 79251e6827fe262fe190eb82c5de15b7c3a17ab4 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 29 Jun 2023 11:10:57 +0000 Subject: [PATCH 58/68] cleanup the old files --- src/deepsparse/benchmark/ort_engine.py | 6 +- src/deepsparse/engine.py | 6 +- src/deepsparse/pipeline.py | 2 +- .../transformers/eval_downstream.py | 30 +++-- .../transformers/eval_text_generation.py | 100 -------------- src/deepsparse/transformers/helpers.py | 33 ++--- src/deepsparse/transformers/metrics.py | 125 ++++++------------ .../transformers/pipelines/text_generation.py | 46 +++---- 8 files changed, 87 insertions(+), 261 deletions(-) delete mode 100644 src/deepsparse/transformers/eval_text_generation.py diff --git a/src/deepsparse/benchmark/ort_engine.py b/src/deepsparse/benchmark/ort_engine.py index 4a340d326b..d2d61e83a1 100644 --- a/src/deepsparse/benchmark/ort_engine.py +++ b/src/deepsparse/benchmark/ort_engine.py @@ -120,10 +120,10 @@ def __init__( # cleaned up 
once we pass the loaded ONNX around and not paths if self._input_shapes: with override_onnx_input_shapes( - self._model_path, self._input_shapes, inplace=True + self._model_path, self._input_shapes ) as input_override_model_path: with override_onnx_batch_size( - input_override_model_path, batch_size, inplace=True + input_override_model_path, batch_size ) as batch_override_model_path: self._eng_net = onnxruntime.InferenceSession( batch_override_model_path, @@ -132,7 +132,7 @@ def __init__( ) else: with override_onnx_batch_size( - self._model_path, batch_size, inplace=True + self._model_path, batch_size ) as batch_override_model_path: self._eng_net = onnxruntime.InferenceSession( batch_override_model_path, diff --git a/src/deepsparse/engine.py b/src/deepsparse/engine.py index 359399bae5..3e87d18efe 100644 --- a/src/deepsparse/engine.py +++ b/src/deepsparse/engine.py @@ -746,7 +746,7 @@ def __init__( if self._input_shapes: with override_onnx_input_shapes( - self._model_path, self._input_shapes, inplace=True + self._model_path, self._input_shapes ) as model_path: self._eng_net = LIB.deepsparse_engine( model_path, @@ -834,9 +834,7 @@ def __init__( if self._input_shapes: with override_onnx_input_shapes( - self._model_path, - self._input_shapes, - inplace=True, + self._model_path, self._input_shapes ) as model_path: self._eng_net = LIB.deepsparse_engine( model_path, diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 244101a9e0..34badf3f5b 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -261,7 +261,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index c56a1de7b2..23af9770bc 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -79,25 +79,30 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): - dataset = load_dataset(dataset_name)["validation"] - perplexity_metrics = Perplexity.get_metrics() + dataset = load_dataset(dataset_name)["test"] text_generation = Pipeline.create( task="text-generation", model_path=args.model_path, - engine_type=args.engine, + # TODO: make sure this also works for deepsparse engine + engine_type="onnxruntime", num_cores=args.num_cores, sequence_length=args.max_sequence_length, prompt_processing_sequence_length=args.max_sequence_length, max_generated_tokens=1, + tokenizer_padding_side="right", ) + perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size) + # TODO: text_generation.engine is None print(f"Engine info: {text_generation.engine}") predictions = [] for idx, sample in _enumerate_progress(dataset, args.max_samples): - text_batch.append(sample["prompt"] + sample["canonical_solution"]) - if len(text_batch) == batch_size: - perplexity_metrics.add_batch(predictions, batch_size=batch_size) - text_batch = [] + predictions.append(sample["prompt"] + sample["canonical_solution"]) + if len(predictions) == batch_size: + perplexity_metrics.add_batch(predictions) + predictions = [] + if idx == 32: + break return perplexity_metrics @@ -466,16 +471,20 @@ def 
_split_train_val(train_dataset, val_ratio, seed=42): "imdb": imdb_eval, "conll2003": conll2003_eval, "go_emotions": go_emotions_eval, + "openai_humaneval": perplexity_eval, } def parse_args(): parser = argparse.ArgumentParser( + # TODO: Not BERT anymore description="Evaluate a BERT ONNX model on a downstream dataset" ) parser.add_argument( - "model_path", + "-m", + "--model_path", type=str, + default="/home/ubuntu/damian/sparseml/deployment", help=( "The path to a directory containing model.onnx, config.json, and " "tokenizer.json files or SparseZoo stub to the model" @@ -485,8 +494,7 @@ def parse_args(): "-d", "--dataset", type=str, - choices=list(SUPPORTED_DATASETS.keys()), - required=True, + default="openai_humaneval", ) parser.add_argument( "-v", @@ -539,7 +547,7 @@ def parse_args(): "--max-samples", help="the max number of samples to evaluate. Default is None or all samples", type=int, - default=None, + default=32, ) parser.add_argument( diff --git a/src/deepsparse/transformers/eval_text_generation.py b/src/deepsparse/transformers/eval_text_generation.py deleted file mode 100644 index 6d98722a6b..0000000000 --- a/src/deepsparse/transformers/eval_text_generation.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Example use: -python src/deepsparse/transformers/eval_text_generation.py ---task opt ---model_path /home/ubuntu/damian/sparseml/deployment ---sentence "While the Indiana Jones movies likely inspired a lot of young viewers" ---sequence_length 128 -""" # noqa: W291 -import argparse -from typing import Any, Dict - -from datasets import load_dataset -from deepsparse import Pipeline -from deepsparse.transformers.metrics import Perplexity -from evaluate import load - - -def perplexity_eval(args: argparse.Namespace) -> Dict[str, Any]: - - pipeline = Pipeline.create( - task=args.task, - model_path=args.model_path, - sequence_length=args.sequence_length, - prompt_processing_sequence_length=args.sequence_length, - engine_type=args.engine_type, - max_generated_tokens=1, - ) - perplexity = Perplexity(pipeline=pipeline) - dataset = load_dataset(args.dataset_name, split="test") - _perplexity = load("perplexity", module_type="metric") - texts = [] - for idx, sample in enumerate(dataset): - text = sample["prompt"] + sample["canonical_solution"] - texts.append(text) - if idx == 2: - break - _result = _perplexity.compute(predictions=texts, model_id="facebook/opt-350m") - perplexity.add_batch(texts, batch_size=16) - result = perplexity.compute() - return result - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser( - description="Evaluate perplexity of a text-generation model on a dataset." 
- ) - parser.add_argument( - "--model_path", - type=str, - default="/home/ubuntu/damian/sparseml/deployment", - help="Path to the model directory", - ) - parser.add_argument( - "--sentence", - type=str, - help="A sentence to evaluate perplexity on", - ) - parser.add_argument( - "--sequence_length", - type=int, - default=1024, - help="Sequence length of the pipeline to use for evaluation", - ) - parser.add_argument( - "--task", - type=str, - default="codegen", - help="Task to use for evaluation", - ) - parser.add_argument( - "--engine_type", - type=str, - default="onnxruntime", - help="Engine type to use for evaluation", - ) - parser.add_argument( - "--dataset_name", - type=str, - default="openai_humaneval", - help="Dataset name to use for evaluation", - ) - - args = parser.parse_args() - results = perplexity_eval(args) - print(results) diff --git a/src/deepsparse/transformers/helpers.py b/src/deepsparse/transformers/helpers.py index 5154ada014..c951d232c8 100644 --- a/src/deepsparse/transformers/helpers.py +++ b/src/deepsparse/transformers/helpers.py @@ -21,7 +21,7 @@ import re from pathlib import Path from tempfile import NamedTemporaryFile -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import numpy import onnx @@ -52,7 +52,6 @@ def get_onnx_path_and_configs( model_path: str, require_configs: bool = False, - model_dir_onnx_name: str = _MODEL_DIR_ONNX_NAME, ) -> Tuple[str, Optional[str], Optional[str]]: """ :param model_path: path to onnx file, transformers sparsezoo stub, @@ -62,7 +61,6 @@ def get_onnx_path_and_configs( :param require_configs: if True, model_path must be a directory containing `model.onnx`, `config.json`, and `tokenizer.json` files. Will raise an exception otherwise - :param model_dir_onnx_name: name of onnx file in model directory :return: tuple of ONNX file path, parent directory of config file if it exists, and parent directory of tokenizer config file if it exists. (Parent directories returned instead of absolute path @@ -81,9 +79,9 @@ def get_onnx_path_and_configs( raise ValueError( f"{_MODEL_DIR_ONNX_NAME} not found in transformers model directory " f"{model_path}. Be sure that an export of the model is written to " - f"{os.path.join(model_path, model_dir_onnx_name)}" + f"{os.path.join(model_path, _MODEL_DIR_ONNX_NAME)}" ) - onnx_path = os.path.join(model_path, model_dir_onnx_name) + onnx_path = os.path.join(model_path, _MODEL_DIR_ONNX_NAME) # attempt to read config and tokenizer from sparsezoo-like framework directory framework_dir = None @@ -141,8 +139,6 @@ def overwrite_transformer_onnx_model_inputs( batch_size: int = 1, max_length: int = 128, inplace: bool = True, - custom_input_overwrite_func: Optional[Callable] = None, - custom_input_overwrite_func_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[Optional[str], List[str], Optional[NamedTemporaryFile]]: """ Overrides an ONNX model's inputs to have the given batch size and sequence lengths. @@ -155,10 +151,6 @@ def overwrite_transformer_onnx_model_inputs( :param inplace: if True, the model will be modified in place (its inputs will be overwritten). Else, a copy of that model, with overwritten inputs, will be saved to a temporary file - :param custom_input_overwrite_func: if provided, this function will be called - instead of the default input overwrite function. 
This function should take - in a list of external inputs and return a list of the overwritten input names - :param custom_input_overwrite_func_kwargs: kwargs for the custom overwrite function :return: tuple of (path to the overwritten model, list of input names that were overwritten, and a temporary file containing the overwritten model if `inplace=False`, else None) @@ -171,20 +163,11 @@ def overwrite_transformer_onnx_model_inputs( external_inputs = [ inp for inp in model.graph.input if inp.name not in initializer_input_names ] - if custom_input_overwrite_func is not None: - custom_input_overwrite_func_kwargs = custom_input_overwrite_func_kwargs or {} - input_names = custom_input_overwrite_func( - external_inputs, - batch_size, - max_length, - **custom_input_overwrite_func_kwargs, - ) - else: - input_names = [] - for external_input in external_inputs: - external_input.type.tensor_type.shape.dim[0].dim_value = batch_size - external_input.type.tensor_type.shape.dim[1].dim_value = max_length - input_names.append(external_input.name) + input_names = [] + for external_input in external_inputs: + external_input.type.tensor_type.shape.dim[0].dim_value = batch_size + external_input.type.tensor_type.shape.dim[1].dim_value = max_length + input_names.append(external_input.name) # Save modified model if inplace: diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 9ad14e7efb..ffce0dbb98 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -21,7 +21,6 @@ import numpy from tqdm import tqdm -from transformers import AutoTokenizer import torch from deepsparse import Pipeline @@ -36,10 +35,7 @@ class Perplexity: - def __init__( - self, - pipeline: Pipeline, - ): + def __init__(self, pipeline: Pipeline, batch_size: int = 16): """ Given the pipeline, compute the perplexity of the model on the given text input. @@ -48,119 +44,72 @@ def __init__( https://huggingface.co/spaces/evaluate-metric/perplexity/blob/main/perplexity.py # noqa: E501 :param pipeline: The pipeline to use for text generation + :param batch_size: The batch size to split the input text into + non-overlapping batches """ if not isinstance(pipeline, TextGenerationPipeline): raise ValueError( "Perplexity can only be computed for text generation pipelines" ) self._pipeline = pipeline - self._tokenizer = AutoTokenizer.from_pretrained(pipeline.model_path) - self._vocab_size = pipeline.config.vocab_size - self._static_length = pipeline.sequence_length + self._batch_size = batch_size + self._sequence_length = pipeline.sequence_length self._loss_fct = torch.nn.CrossEntropyLoss(reduction="none") - self.encoded_batches = None # (batch_size, self._static_length) - self.attention_masks = None # (batch_size, self._static_length) + self.perplexities = [] - def add_batch( - self, predictions: List[str], batch_size: int = 16, add_start_token: bool = True - ): + def add_batch(self, predictions: List[str]): """ - Converts input_text into data that can be eventually used to compute perplexity. - Note: BOS token means "Beginning of Sentence" token, which as - the same as SOS token "Start of Sentence" token. + Run the model on the given input sequences and compute the perplexity. + The resulting perplexity is appended to the list of perplexities. 
:param predictions: The predictions to compute perplexity on - :param batch_size: The batch size to split the input text into - non-overlapping batches - :param add_start_token: Whether to add the start token to the input text """ - - self._tokenizer.pad_token = self._tokenizer.eos_token - # tokenize list of strings - - encodings = self._tokenizer( + # tokenize the input text + encodings = self._pipeline.tokenizer( predictions, return_attention_mask=True, - max_length=self._static_length, + max_length=self._sequence_length, truncation=True, padding="max_length", ) - # undo what this tokenizer does encoded_texts = encodings["input_ids"] attention_masks = encodings["attention_mask"] # split input_text into non-overlapping batches of `batch_size` - for start_index in tqdm(range(0, len(encoded_texts), batch_size)): - end_index = min(start_index + batch_size, len(encoded_texts)) + for start_index in tqdm(range(0, len(encoded_texts), self._batch_size)): + end_index = min(start_index + self._batch_size, len(encoded_texts)) encoded_batch = encoded_texts[start_index:end_index] attention_mask = attention_masks[start_index:end_index] - # save batches in the class object's state - self.encoded_batches = ( - encoded_batch - if self.encoded_batches is None - else numpy.concatenate([self.encoded_batches, encoded_batch], axis=0) - ) - self.attention_masks = ( - attention_mask - if self.attention_masks is None - else numpy.concatenate([self.attention_masks, attention_mask], axis=0) + out = self._pipeline(sequences=predictions, return_logits=True) + logits = out.logits + + labels = encoded_batch + + # shift logits and labels create the input and target for the loss function + shift_logits = logits[:, :-1, :] + shift_labels = numpy.stack(labels)[:, 1:] + shift_attention_mask_batch = numpy.stack(attention_mask)[:, 1:] + + # compute perplexity for this batch + perplexity_batch = torch.exp( + ( + self._loss_fct( + torch.tensor(shift_logits.transpose(0, 2, 1)), + torch.tensor(shift_labels), + ) + * torch.tensor(shift_attention_mask_batch) + ).sum(1) + / torch.tensor(shift_attention_mask_batch).sum(1) ) + self.perplexities.extend(perplexity_batch.numpy().tolist()) def compute(self) -> Dict[str, Any]: - """ - Given the data collected by add_batch() method, - compute the perplexity of the model - """ - perplexities = [] - - """ - Because we are not able to run batched inference - on the pipeline, we need to run inference on each - sequence in the batch individually. 
- In the future, once the batch support is ready, - we could simply run in the pipeline - ``` - out = self._pipeline(sequence=func(self.encoded_batches)) - ``` - """ - out = self._pipeline( - input_ids_and_masks=( - numpy.stack(self.encoded_batches), - numpy.stack(self.attention_masks), - ), - return_logits=True, - ) - logits = out.logits - - labels = self.encoded_batches - - # shift logits and labels create the input and target for the loss function - shift_logits = logits[:, :-1, :] - shift_labels = numpy.stack(labels)[ - :, 1: - ] # (batch_size - 1, self._static_length) - shift_attention_mask_batch = numpy.stack(self.attention_masks)[ - :, 1: - ] # (batch_size - 1, self._static_length) - - # compute perplexity for this batch - perplexity_batch = torch.exp( - ( - self._loss_fct( - torch.tensor(shift_logits.transpose(0, 2, 1)), - torch.tensor(shift_labels), - ) - * torch.tensor(shift_attention_mask_batch) - ).sum(1) - / torch.tensor(shift_attention_mask_batch).sum(1) - ) - return { - "perplexities": perplexity_batch.numpy().tolist(), - "mean_perplexity": perplexity_batch.mean().item(), + "mean_perplexity": numpy.mean(self.perplexities), + "perplexities": self.perplexities, } diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index c2b16acec3..e25cea10e3 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -27,16 +27,9 @@ class TextGenerationInput(BaseModel): - sequences: Optional[Union[str, List[str]]] = Field( - default=None, + sequences: Union[str, List[str]] = Field( description="The input sequences to generate the text from.", ) - input_ids_and_masks: Optional[Tuple[numpy.ndarray, numpy.ndarray]] = Field( - default=None, - description="The input ids and masks for the input sequences. " - "If None, the input ids and masks will be " - "generated from the input sequences.", - ) return_logits: bool = Field( default=False, description="A flag that indicates whether to return " @@ -50,9 +43,6 @@ class TextGenerationInput(BaseModel): "will be set to a random uuid.", ) - class Config: - arbitrary_types_allowed = True - class TextGenerationOutput(BaseModel): sequences: Union[str, List[str]] = Field( @@ -99,6 +89,8 @@ class TextGenerationPipeline(TransformersPipeline): of tokens supplied even if the stop token is reached. :param use_deepsparse_cache: if True, the pipeline will use the deepsparse kv cache for caching the model outputs. + :param tokenizer_padding_side: the side to pad the input sequence to. + Either "left" or "right". Defaults to "left". 
:param kwargs: kwargs to pass to the TransformersPipeline """ @@ -111,6 +103,7 @@ def __init__( prompt_processing_sequence_length: int = 128, force_max_tokens: bool = False, use_deepsparse_cache: bool = False, + tokenizer_padding_side: str = "left", **kwargs, ): if use_deepsparse_cache: @@ -136,8 +129,7 @@ def __init__( self.prompt_processing_sequence_length = prompt_processing_sequence_length self.force_max_tokens = force_max_tokens - # override tokenizer to pad to left - self.tokenizer.padding_side = "right" + self.tokenizer.padding_side = tokenizer_padding_side self.engine = None self.multitoken_engine = NLDecoderEngine( @@ -209,23 +201,17 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: :param inputs: the input schema for the pipeline :return: the inputs for the engine """ - self.tokenizer.pad_token = self.tokenizer.eos_token - if inputs.input_ids_and_masks: - input_tokens = dict( - input_ids=inputs.input_ids_and_masks[0], - attention_mask=inputs.input_ids_and_masks[1], - ) + self.tokenizer.pad_token = self.tokenizer.eos_token - else: - input_tokens = self.tokenizer( - inputs.sequences, - return_tensors="np", - max_length=self.sequence_length, - padding="max_length", - truncation=True, - ) - # undo what this tokenizer does + input_tokens = self.tokenizer( + inputs.sequences, + return_tensors="np", + max_length=self.sequence_length, + padding="max_length", + # TODO: Truncating by default may be a problem + truncation=True, + ) attention_mask = input_tokens["attention_mask"] @@ -258,7 +244,9 @@ def process_engine_outputs( """ generated_tokens, generated_logits = engine_outputs sequences = self.tokenizer.batch_decode( - generated_tokens[0], skip_special_tokens=True + # TODO: hack for now, make it general + *generated_tokens[0], + skip_special_tokens=True, ) logits = generated_logits if kwargs.get("return_logits") else None From 9efbdb6d038c08092a6354abfd23961079a21021 Mon Sep 17 00:00:00 2001 From: dbogunowicz <97082108+dbogunowicz@users.noreply.github.com> Date: Thu, 29 Jun 2023 13:11:42 +0200 Subject: [PATCH 59/68] Update src/deepsparse/transformers/engines/nl_decoder_engine.py --- src/deepsparse/transformers/engines/nl_decoder_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index a69d5e22ae..cd778e53e4 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -1,5 +1,4 @@ # Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
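
The reworked `Perplexity` metric in the patches above follows the standard causal-LM recipe: shift the logits and labels by one position, mask out padded tokens with the attention mask, average the per-token cross-entropy over each sequence, and exponentiate. The sketch below isolates just that calculation; it is a toy illustration rather than part of the patch set. The helper name `sequence_perplexities` and the random inputs are invented for the example, while the shift/mask/exp steps mirror `Perplexity.add_batch` above.

import numpy
import torch


def sequence_perplexities(logits, labels, attention_mask):
    # logits: (batch, seq_len, vocab); labels and attention_mask: (batch, seq_len)
    shift_logits = logits[:, :-1, :]    # prediction for token t+1 lives at position t
    shift_labels = labels[:, 1:]        # targets are the inputs shifted left by one
    shift_mask = attention_mask[:, 1:]  # drop padded positions from the loss

    # per-token negative log-likelihood; CrossEntropyLoss expects (batch, vocab, seq)
    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    token_nll = loss_fct(
        torch.tensor(shift_logits.transpose(0, 2, 1)),
        torch.tensor(shift_labels),
    )
    mask = torch.tensor(shift_mask)
    per_sequence_nll = (token_nll * mask).sum(1) / mask.sum(1)
    return torch.exp(per_sequence_nll).numpy().tolist()


# toy example: 2 sequences of length 4 over a 5-token vocabulary
rng = numpy.random.default_rng(seed=0)
logits = rng.standard_normal((2, 4, 5)).astype(numpy.float32)
labels = rng.integers(0, 5, size=(2, 4))
mask = numpy.array([[1, 1, 1, 1], [1, 1, 1, 0]])
print(sequence_perplexities(logits, labels, mask))

The metric in the patches performs this computation batch by batch and keeps the resulting per-sequence perplexities, so `compute()` only needs to average them.
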
From da5e93e41e30a67e522358221ec1bd9a49299f98 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 29 Jun 2023 12:19:48 +0000 Subject: [PATCH 60/68] ready for review --- src/deepsparse/transformers/eval_downstream.py | 14 ++++++-------- src/deepsparse/transformers/metrics.py | 5 ++++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 23af9770bc..9eb82381bf 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -101,8 +101,6 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): if len(predictions) == batch_size: perplexity_metrics.add_batch(predictions) predictions = [] - if idx == 32: - break return perplexity_metrics @@ -477,14 +475,13 @@ def _split_train_val(train_dataset, val_ratio, seed=42): def parse_args(): parser = argparse.ArgumentParser( - # TODO: Not BERT anymore + # TODO: It is not BERT anymore, should we + # have another script or modify the existing one? description="Evaluate a BERT ONNX model on a downstream dataset" ) parser.add_argument( - "-m", - "--model_path", + "model_path", type=str, - default="/home/ubuntu/damian/sparseml/deployment", help=( "The path to a directory containing model.onnx, config.json, and " "tokenizer.json files or SparseZoo stub to the model" @@ -493,8 +490,9 @@ def parse_args(): parser.add_argument( "-d", "--dataset", + choices=list(SUPPORTED_DATASETS.keys()), + required=True, type=str, - default="openai_humaneval", ) parser.add_argument( "-v", @@ -547,7 +545,7 @@ def parse_args(): "--max-samples", help="the max number of samples to evaluate. Default is None or all samples", type=int, - default=32, + default=None, ) parser.add_argument( diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index ffce0dbb98..cdcfb293df 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -77,7 +77,6 @@ def add_batch(self, predictions: List[str]): encoded_texts = encodings["input_ids"] attention_masks = encodings["attention_mask"] - # split input_text into non-overlapping batches of `batch_size` for start_index in tqdm(range(0, len(encoded_texts), self._batch_size)): end_index = min(start_index + self._batch_size, len(encoded_texts)) encoded_batch = encoded_texts[start_index:end_index] @@ -107,6 +106,10 @@ def add_batch(self, predictions: List[str]): self.perplexities.extend(perplexity_batch.numpy().tolist()) def compute(self) -> Dict[str, Any]: + """ + :return: A dictionary containing the mean perplexity + and the list of perplexities + """ return { "mean_perplexity": numpy.mean(self.perplexities), "perplexities": self.perplexities, From a680dac16272c8fc75fa667eb3615a3a44340656 Mon Sep 17 00:00:00 2001 From: Damian Date: Thu, 29 Jun 2023 12:22:29 +0000 Subject: [PATCH 61/68] ready for testing --- .../transformers/pipelines/text_generation.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index e25cea10e3..e1ff03fa80 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -132,6 +132,7 @@ def __init__( self.tokenizer.padding_side = tokenizer_padding_side self.engine = None + self.multitoken_engine = NLDecoderEngine( onnx_file_path=self.onnx_file_path, 
engine_type=self.engine_type, @@ -160,6 +161,15 @@ def __init__( tokenizer=self.tokenizer, use_deepsparse_cache=use_deepsparse_cache, ) + if ( + not self.multitoken_engine.kv_cache_enabled + and self.max_generated_tokens > 1 + ): + raise ValueError( + "The model used for inference does not support kv cache. It is " + "assumed that it maps from the token sequence to predicted logits." + "Set `max_generated_tokens` to 1 to support that scenario." + ) @staticmethod def route_input_to_bucket( @@ -265,12 +275,6 @@ def engine_forward( of logits for each generated token """ if not self.multitoken_engine.kv_cache_enabled: - if self.max_generated_tokens != 1: - raise ValueError( - "The model used for inference does not support kv cache. It is " - "assumed that it maps from the token sequence to predicted logits." - "Set `max_generated_tokens` to 1 to support that scenario." - ) tokens, logits = self.multitoken_engine(engine_inputs) tokens = [tokens] else: From f83dcabbd653cd93ce8301878e7f8841824eebcb Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 3 Jul 2023 09:37:04 +0000 Subject: [PATCH 62/68] assert proper padding on pipeline init --- src/deepsparse/transformers/pipelines/text_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index e1ff03fa80..52c227dc00 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -130,6 +130,8 @@ def __init__( self.force_max_tokens = force_max_tokens self.tokenizer.padding_side = tokenizer_padding_side + if not self.tokenizer.pad_token: + self.tokenizer.pad_token = self.tokenizer.eos_token self.engine = None @@ -212,8 +214,6 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: :return: the inputs for the engine """ - self.tokenizer.pad_token = self.tokenizer.eos_token - input_tokens = self.tokenizer( inputs.sequences, return_tensors="np", From e659c33190088dd2d73f05e1ee4d591c11ce8e31 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 3 Jul 2023 12:48:24 +0000 Subject: [PATCH 63/68] now also supporting kv cache perplexity. 
time for cleanup --- src/deepsparse/pipeline.py | 3 ++- src/deepsparse/transformers/eval_downstream.py | 2 +- src/deepsparse/transformers/pipelines/text_generation.py | 6 ++++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index 34badf3f5b..dc6be77382 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -261,7 +261,8 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - batch_outputs = list(self.executor.map(self.engine_forward, batches)) + # batch_outputs = list(self.executor.map(self.engine_forward, batches)) + batch_outputs = [self.engine_forward(x) for x in batches] # join together the batches of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 9eb82381bf..09155f272c 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,7 +78,7 @@ ORT_ENGINE = "onnxruntime" -def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): +def perplexity_eval(args, batch_size=1, dataset_name="openai_humaneval"): dataset = load_dataset(dataset_name)["test"] text_generation = Pipeline.create( diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 52c227dc00..9b47c0d18d 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -322,7 +322,7 @@ def prompt_inference( ['batch_size', 'num_tokens', 'vocab_size']) """ # get tokens by attention mask - tokens = engine_inputs[0][engine_inputs[1].nonzero()].tolist() + tokens = engine_inputs[0][0].tolist() new_token = None num_tokens_processed = 0 @@ -343,15 +343,17 @@ def prompt_inference( # prompt size is small, run autoregressive inference to populate kv cache run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed] + logits = [] for token in tokens[num_tokens_processed:]: run_tokens.append(token) new_token, new_logits = self.autoregressive_inference( run_tokens, shift_positions_by_one=not bool(num_tokens_processed) ) + logits.append(new_logits) tokens.append(new_token) - return tokens, new_logits + return tokens, numpy.concatenate(logits, axis=1) def autoregressive_inference( self, From cf74ad7c6537454008c5e2444dcc8eb06d461d50 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 3 Jul 2023 14:48:22 +0000 Subject: [PATCH 64/68] ready for review --- src/deepsparse/pipeline.py | 3 +- .../transformers/eval_downstream.py | 7 +-- src/deepsparse/transformers/metrics.py | 4 +- .../transformers/pipelines/text_generation.py | 57 +++++++++++++------ 4 files changed, 46 insertions(+), 25 deletions(-) diff --git a/src/deepsparse/pipeline.py b/src/deepsparse/pipeline.py index dc6be77382..34badf3f5b 100644 --- a/src/deepsparse/pipeline.py +++ b/src/deepsparse/pipeline.py @@ -261,8 +261,7 @@ def __call__(self, *args, **kwargs) -> BaseModel: batches = self.split_engine_inputs(engine_inputs, self._batch_size) # submit split batches to engine threadpool - # batch_outputs = list(self.executor.map(self.engine_forward, batches)) - batch_outputs = [self.engine_forward(x) for x in batches] + batch_outputs = list(self.executor.map(self.engine_forward, batches)) # join together the batches 
of size `self._batch_size` engine_outputs = self.join_engine_outputs(batch_outputs) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 09155f272c..4a78dcec09 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -78,22 +78,21 @@ ORT_ENGINE = "onnxruntime" -def perplexity_eval(args, batch_size=1, dataset_name="openai_humaneval"): +def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): dataset = load_dataset(dataset_name)["test"] text_generation = Pipeline.create( task="text-generation", model_path=args.model_path, - # TODO: make sure this also works for deepsparse engine - engine_type="onnxruntime", + engine_type=args.engine_type, num_cores=args.num_cores, sequence_length=args.max_sequence_length, prompt_processing_sequence_length=args.max_sequence_length, max_generated_tokens=1, tokenizer_padding_side="right", + remove_special_tokens_from_prompt=False, ) perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size) - # TODO: text_generation.engine is None print(f"Engine info: {text_generation.engine}") predictions = [] for idx, sample in _enumerate_progress(dataset, args.max_samples): diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index cdcfb293df..3eba344a0c 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -82,7 +82,9 @@ def add_batch(self, predictions: List[str]): encoded_batch = encoded_texts[start_index:end_index] attention_mask = attention_masks[start_index:end_index] - out = self._pipeline(sequences=predictions, return_logits=True) + out = self._pipeline( + sequences=predictions, return_logits=True, truncate=True + ) logits = out.logits labels = encoded_batch diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 9b47c0d18d..07e2cc3e76 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -11,8 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from typing import Dict, List, Optional, Tuple, Type, Union +import warnings +from typing import List, Optional, Tuple, Type, Union import numpy from pydantic import BaseModel, Field @@ -42,6 +42,13 @@ class TextGenerationInput(BaseModel): "and the model is using kv cache, it " "will be set to a random uuid.", ) + truncate: bool = Field( + default=False, + description="A flag that indicates whether to truncate " + "the input text sequence. Useful, when a batch of " + "predictions needs to have consistent length so one" + "can compute metric in a batched fashion. ", + ) class TextGenerationOutput(BaseModel): @@ -91,6 +98,8 @@ class TextGenerationPipeline(TransformersPipeline): for caching the model outputs. :param tokenizer_padding_side: the side to pad the input sequence to. Either "left" or "right". Defaults to "left". + :param remove_special_tokens_from_prompt: if True, the pipeline will remove + the special tokens from the prompt, before processing it. Defaults to True. 
:param kwargs: kwargs to pass to the TransformersPipeline """ @@ -104,6 +113,7 @@ def __init__( force_max_tokens: bool = False, use_deepsparse_cache: bool = False, tokenizer_padding_side: str = "left", + remove_special_tokens_from_prompt: bool = True, **kwargs, ): if use_deepsparse_cache: @@ -119,6 +129,12 @@ def __init__( "supported for text generation pipelines" ) + if tokenizer_padding_side != "left": + warnings.warn( + "By default the tokenizer padding side is set to left. " + f"Setting it to {tokenizer_padding_side} may result in " + "unexpected behavior." + ) super().__init__( **kwargs, _delay_engine_initialize=True, _delay_overwriting_inputs=True ) @@ -128,6 +144,7 @@ def __init__( self.max_generated_tokens = max_generated_tokens self.prompt_processing_sequence_length = prompt_processing_sequence_length self.force_max_tokens = force_max_tokens + self.remove_special_tokens_from_prompt = remove_special_tokens_from_prompt self.tokenizer.padding_side = tokenizer_padding_side if not self.tokenizer.pad_token: @@ -219,8 +236,7 @@ def process_inputs(self, inputs: TextGenerationInput) -> List[numpy.ndarray]: return_tensors="np", max_length=self.sequence_length, padding="max_length", - # TODO: Truncating by default may be a problem - truncation=True, + truncation=inputs.truncate, ) attention_mask = input_tokens["attention_mask"] @@ -254,9 +270,7 @@ def process_engine_outputs( """ generated_tokens, generated_logits = engine_outputs sequences = self.tokenizer.batch_decode( - # TODO: hack for now, make it general - *generated_tokens[0], - skip_special_tokens=True, + generated_tokens, skip_special_tokens=True ) logits = generated_logits if kwargs.get("return_logits") else None @@ -275,11 +289,12 @@ def engine_forward( of logits for each generated token """ if not self.multitoken_engine.kv_cache_enabled: - tokens, logits = self.multitoken_engine(engine_inputs) - tokens = [tokens] + tokens, prompt_logits = self.multitoken_engine(engine_inputs) + return numpy.array([tokens]), prompt_logits + else: # run the prompt through - tokens, logits = self.prompt_inference(engine_inputs) + tokens, prompt_logits = self.prompt_inference(engine_inputs) # create the generated output max_tokens = ( @@ -289,7 +304,7 @@ def engine_forward( ) # set safety for absolute max generation generated_tokens = [tokens[-1]] - generated_logits = [logits] + generated_logits = prompt_logits while len(generated_tokens) < max_tokens: ( @@ -303,13 +318,13 @@ def engine_forward( if token == self.tokenizer.eos_token_id and not self.force_max_tokens: break - return numpy.array([[generated_tokens]]), numpy.concatenate( + return numpy.array(generated_tokens), numpy.concatenate( generated_logits, axis=1 ) def prompt_inference( self, engine_inputs: List[numpy.ndarray] - ) -> Tuple[List[int], Dict[str, numpy.ndarray]]: + ) -> Tuple[List[int], List[numpy.ndarray]]: """ An inference run that processes the prompt through the model to generate the new token and logits @@ -321,8 +336,14 @@ def prompt_inference( - The logits generated from the prompt (with dimensions ['batch_size', 'num_tokens', 'vocab_size']) """ - # get tokens by attention mask - tokens = engine_inputs[0][0].tolist() + tokens = engine_inputs[0] + if self.remove_special_tokens_from_prompt: + # get tokens by attention mask + tokens = tokens[engine_inputs[1].nonzero()].tolist() + else: + tokens = tokens[0].tolist() + + prompt_logits = [] new_token = None num_tokens_processed = 0 @@ -343,17 +364,17 @@ def prompt_inference( # prompt size is small, run autoregressive inference to 
populate kv cache run_tokens = [] if num_tokens_processed == 0 else tokens[:num_tokens_processed] - logits = [] + for token in tokens[num_tokens_processed:]: run_tokens.append(token) new_token, new_logits = self.autoregressive_inference( run_tokens, shift_positions_by_one=not bool(num_tokens_processed) ) - logits.append(new_logits) + prompt_logits.append(new_logits) tokens.append(new_token) - return tokens, numpy.concatenate(logits, axis=1) + return tokens, prompt_logits def autoregressive_inference( self, From 853f8764f09c7537f5b8ce19ecd70e1f071554f1 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 3 Jul 2023 15:12:35 +0000 Subject: [PATCH 65/68] correctly print engine info --- .../transformers/engines/nl_decoder_engine.py | 3 +++ src/deepsparse/transformers/eval_downstream.py | 10 ++++++++-- .../transformers/pipelines/text_generation.py | 3 ++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index cd778e53e4..5a7e7ac13c 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -250,6 +250,9 @@ def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: return numpy.random.choice(len(probs), p=probs) + def __str__(self): + return f"NLDecoderEngine: {self.engine}" + def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: # initialize empty kv cache of size # (batch_size, num_attention_heads, length, hidden_dims) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 4a78dcec09..01bf5e1ef3 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -84,7 +84,7 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): text_generation = Pipeline.create( task="text-generation", model_path=args.model_path, - engine_type=args.engine_type, + engine_type=args.engine, num_cores=args.num_cores, sequence_length=args.max_sequence_length, prompt_processing_sequence_length=args.max_sequence_length, @@ -93,7 +93,13 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): remove_special_tokens_from_prompt=False, ) perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size) - print(f"Engine info: {text_generation.engine}") + active_engines = [ + engine + for engine in [text_generation.engine, text_generation.multitoken_engine] + if engine + ] + print("Engine info: ") + [print(f"{engine}\n") for engine in active_engines] predictions = [] for idx, sample in _enumerate_progress(dataset, args.max_samples): predictions.append(sample["prompt"] + sample["canonical_solution"]) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 07e2cc3e76..83667cf599 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -33,7 +33,8 @@ class TextGenerationInput(BaseModel): return_logits: bool = Field( default=False, description="A flag that indicates whether to return " - "the logits for the generated text sequence. ", + "the logits for the input text sequence and the " + "generated text sequence. 
", ) session_id: Optional[str] = Field( default=None, From e8da07e31d0088276644a1b8ac084cb56884ee65 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 3 Jul 2023 16:41:09 +0000 Subject: [PATCH 66/68] work with left padding of the tokenizer --- src/deepsparse/transformers/eval_downstream.py | 8 +++----- src/deepsparse/transformers/metrics.py | 16 ++++++++++++++-- .../transformers/pipelines/text_generation.py | 11 ++--------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/deepsparse/transformers/eval_downstream.py b/src/deepsparse/transformers/eval_downstream.py index 01bf5e1ef3..6e6fa16b20 100644 --- a/src/deepsparse/transformers/eval_downstream.py +++ b/src/deepsparse/transformers/eval_downstream.py @@ -68,15 +68,12 @@ import numpy from tqdm.auto import tqdm -from deepsparse import Pipeline +from deepsparse import DEEPSPARSE_ENGINE, ORT_ENGINE, Pipeline from deepsparse.transformers.metrics import Perplexity, PrecisionRecallF1 from datasets import load_dataset, load_metric # isort: skip -DEEPSPARSE_ENGINE = "deepsparse" -ORT_ENGINE = "onnxruntime" - def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): dataset = load_dataset(dataset_name)["test"] @@ -89,7 +86,6 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): sequence_length=args.max_sequence_length, prompt_processing_sequence_length=args.max_sequence_length, max_generated_tokens=1, - tokenizer_padding_side="right", remove_special_tokens_from_prompt=False, ) perplexity_metrics = Perplexity(pipeline=text_generation, batch_size=batch_size) @@ -106,6 +102,8 @@ def perplexity_eval(args, batch_size=16, dataset_name="openai_humaneval"): if len(predictions) == batch_size: perplexity_metrics.add_batch(predictions) predictions = [] + if args.max_samples and idx >= args.max_samples: + break return perplexity_metrics diff --git a/src/deepsparse/transformers/metrics.py b/src/deepsparse/transformers/metrics.py index 3eba344a0c..ef5dd521eb 100644 --- a/src/deepsparse/transformers/metrics.py +++ b/src/deepsparse/transformers/metrics.py @@ -88,11 +88,23 @@ def add_batch(self, predictions: List[str]): logits = out.logits labels = encoded_batch + labels = numpy.stack(labels) + attention_mask = numpy.stack(attention_mask) + + # because the tokenizer is left padded, we need to move the meaningful + # part of the logits and labels to the right + num_padded_entries = attention_mask.sum(axis=1) + + # shift the values at num_paddings to the top of the array using roll + for i, num_padded in enumerate(num_padded_entries): + logits[i] = numpy.roll(logits[i], num_padded, axis=0) + labels[i] = numpy.roll(labels[i], num_padded, axis=0) + attention_mask[i] = numpy.roll(attention_mask[i], num_padded, axis=0) # shift logits and labels create the input and target for the loss function shift_logits = logits[:, :-1, :] - shift_labels = numpy.stack(labels)[:, 1:] - shift_attention_mask_batch = numpy.stack(attention_mask)[:, 1:] + shift_labels = labels[:, 1:] + shift_attention_mask_batch = attention_mask[:, 1:] # compute perplexity for this batch perplexity_batch = torch.exp( diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 83667cf599..91e67f998a 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -import warnings + from typing import List, Optional, Tuple, Type, Union import numpy @@ -113,7 +113,6 @@ def __init__( prompt_processing_sequence_length: int = 128, force_max_tokens: bool = False, use_deepsparse_cache: bool = False, - tokenizer_padding_side: str = "left", remove_special_tokens_from_prompt: bool = True, **kwargs, ): @@ -130,12 +129,6 @@ def __init__( "supported for text generation pipelines" ) - if tokenizer_padding_side != "left": - warnings.warn( - "By default the tokenizer padding side is set to left. " - f"Setting it to {tokenizer_padding_side} may result in " - "unexpected behavior." - ) super().__init__( **kwargs, _delay_engine_initialize=True, _delay_overwriting_inputs=True ) @@ -147,7 +140,7 @@ def __init__( self.force_max_tokens = force_max_tokens self.remove_special_tokens_from_prompt = remove_special_tokens_from_prompt - self.tokenizer.padding_side = tokenizer_padding_side + self.tokenizer.padding_side = "left" if not self.tokenizer.pad_token: self.tokenizer.pad_token = self.tokenizer.eos_token From 58b12c8d447cd9a12f6411c020fe094b72735e76 Mon Sep 17 00:00:00 2001 From: Damian Date: Mon, 3 Jul 2023 17:02:23 +0000 Subject: [PATCH 67/68] quality --- src/deepsparse/transformers/engines/nl_decoder_engine.py | 2 +- src/deepsparse/transformers/pipelines/text_generation.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/deepsparse/transformers/engines/nl_decoder_engine.py b/src/deepsparse/transformers/engines/nl_decoder_engine.py index 5a7e7ac13c..f75264db14 100644 --- a/src/deepsparse/transformers/engines/nl_decoder_engine.py +++ b/src/deepsparse/transformers/engines/nl_decoder_engine.py @@ -251,7 +251,7 @@ def generate_token(self, logits: numpy.ndarray) -> numpy.ndarray: return numpy.random.choice(len(probs), p=probs) def __str__(self): - return f"NLDecoderEngine: {self.engine}" + return f"{self.__class__.__name__}: {self.engine}" def _initialize_kv_cache_state(self, length: int) -> Dict[str, numpy.ndarray]: # initialize empty kv cache of size diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 91e67f998a..2bcfe98ecd 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -97,8 +97,6 @@ class TextGenerationPipeline(TransformersPipeline): of tokens supplied even if the stop token is reached. :param use_deepsparse_cache: if True, the pipeline will use the deepsparse kv cache for caching the model outputs. - :param tokenizer_padding_side: the side to pad the input sequence to. - Either "left" or "right". Defaults to "left". :param remove_special_tokens_from_prompt: if True, the pipeline will remove the special tokens from the prompt, before processing it. Defaults to True. 
:param kwargs: kwargs to pass to the TransformersPipeline @@ -140,6 +138,7 @@ def __init__( self.force_max_tokens = force_max_tokens self.remove_special_tokens_from_prompt = remove_special_tokens_from_prompt + # override tokenizer to pad to left self.tokenizer.padding_side = "left" if not self.tokenizer.pad_token: self.tokenizer.pad_token = self.tokenizer.eos_token From eecd2323418ec119ad929ce27a6f0ed2a9bd7ea7 Mon Sep 17 00:00:00 2001 From: Damian Date: Wed, 5 Jul 2023 10:03:26 +0000 Subject: [PATCH 68/68] fix the multitoken inference --- src/deepsparse/transformers/pipelines/text_generation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/deepsparse/transformers/pipelines/text_generation.py b/src/deepsparse/transformers/pipelines/text_generation.py index 2bcfe98ecd..4a41b8b32d 100644 --- a/src/deepsparse/transformers/pipelines/text_generation.py +++ b/src/deepsparse/transformers/pipelines/text_generation.py @@ -262,6 +262,9 @@ def process_engine_outputs( :return: the output schema for the pipeline """ generated_tokens, generated_logits = engine_outputs + if generated_tokens.ndim == 1: + # if we have a single dimension, add a batch dimension + generated_tokens = generated_tokens[None, :] sequences = self.tokenizer.batch_decode( generated_tokens, skip_special_tokens=True ) @@ -350,6 +353,7 @@ def prompt_inference( ] new_token, new_logits = self.multitoken_engine(engine_inputs) num_tokens_processed = self.prompt_processing_sequence_length + prompt_logits.append(new_logits) if num_tokens_processed: # transfer the cache state from the multi-token engine to the main engine
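
One detail of the final metric worth spelling out: because the pipeline now always pads prompts on the left, `Perplexity.add_batch` rolls each row so the real tokens sit at the front before the shift-by-one step. `attention_mask.sum(axis=1)` counts the non-padded tokens (despite the `num_padded_entries` name in the patch), and rolling right by that count works because the real tokens form a contiguous suffix of a left-padded row. The sketch below isolates that alignment with toy token ids; the `PAD` value and variable names are invented for the illustration.

import numpy

PAD = 0  # hypothetical pad token id, for illustration only
labels = numpy.array([[PAD, PAD, 11, 12, 13],
                      [PAD,  21, 22, 23, 24]])
attention_mask = numpy.array([[0, 0, 1, 1, 1],
                              [0, 1, 1, 1, 1]])

# number of real (non-padded) tokens in each left-padded row
num_real_tokens = attention_mask.sum(axis=1)

aligned_labels = labels.copy()
aligned_mask = attention_mask.copy()
for i, n in enumerate(num_real_tokens):
    # rolling right by the number of real tokens moves the contiguous
    # suffix of real tokens to the start of the row
    aligned_labels[i] = numpy.roll(labels[i], n)
    aligned_mask[i] = numpy.roll(attention_mask[i], n)

print(aligned_labels)  # [[11 12 13  0  0]
                       #  [21 22 23 24  0]]
print(aligned_mask)    # [[1 1 1 0 0]
                       #  [1 1 1 1 0]]

In `add_batch` the corresponding logits rows are rolled in the same way, so the shifted logits, labels, and mask stay aligned when the loss is computed.
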