[Feature Branch] KV Cache Interface #1083

Merged Jul 12, 2023 · 109 commits
Changes from 16 commits

Commits
48ac0ac
initial commit
dbogunowicz Jun 5, 2023
cf7f2b9
Update src/deepsparse/license.py
dbogunowicz Jun 5, 2023
832630a
Merge branch 'main' into feature/damian/do_not_save_to_tmp
dbogunowicz Jun 6, 2023
9958c83
Merge branch 'main' into feature/damian/do_not_save_to_tmp
dbogunowicz Jun 7, 2023
e6d2b03
limit to 150mb
dbogunowicz Jun 7, 2023
7f9935b
ready to review
dbogunowicz Jun 7, 2023
b1cf01b
initial commit
dbogunowicz Mar 2, 2023
0a3f48d
[Codegen][ORT][Static Seq Length] TextGenerationPipeline (#946)
dbogunowicz Mar 16, 2023
add4625
[CodeGen][Documentation] (#956)
dbogunowicz Mar 23, 2023
22d2746
reimplementation for generative pipelines
markurtz May 8, 2023
7f1651d
restore text generation from examples
dbogunowicz May 8, 2023
b85746d
[CodeGen] ONNX model loading to support >2Gb models / two engines (#991)
dbogunowicz May 8, 2023
aadc608
refactor sucessfull
dbogunowicz May 10, 2023
58bc2b0
Pipeline fully refactored, time to test engine support. Note: Sliding…
dbogunowicz May 11, 2023
d538444
First iteration with Sage
dbogunowicz May 11, 2023
e19676b
Apply suggestions from code review
dbogunowicz May 11, 2023
7908b74
ORT agrees with the Engine. But they both give not entirely correct r…
dbogunowicz May 11, 2023
4bc3472
dynamic ORT vs static DS
dbogunowicz May 12, 2023
c07f7ed
pipeline handles OPT multitoken pass
dbogunowicz May 16, 2023
fb77838
fixes to get static pipeline a little further along
May 16, 2023
2097463
adjust shapes and slicing to enable static autoregressive pass - ISSU…
May 17, 2023
5eb10a9
migrate from cache_length to positions input
May 18, 2023
9213f29
got if working for multitoken + single token scenario
dbogunowicz May 18, 2023
d9af004
cleanup the pipeline
dbogunowicz May 19, 2023
476f25d
further cleanup post merge
dbogunowicz May 19, 2023
fab44e4
Pipeline working for single-token inference only
dbogunowicz May 19, 2023
d454e2f
do not load the onnx model with external files twice
dbogunowicz May 19, 2023
1613e25
pipeline never redundantly saves the external data + more robust toke…
dbogunowicz May 19, 2023
b61055c
Stop saving tmp files, otherwise the engine looks for external files …
dbogunowicz May 19, 2023
6ee25fc
Left pad support
May 19, 2023
5d3004b
cleanup
dbogunowicz May 22, 2023
ace6fa5
cleanup2
dbogunowicz May 22, 2023
388586d
Add in pipeline timing
markurtz May 24, 2023
afd0139
add in force tokens logic
markurtz May 24, 2023
30eeda7
remove input validation for text generation pipelines
markurtz May 24, 2023
5882b56
remove multitoken support for now
markurtz May 24, 2023
4bbe33d
remove kv cache engine and other fixes
markurtz May 25, 2023
afa5746
nest input shape override
markurtz May 25, 2023
e2bb78c
comment out input shape override
markurtz May 25, 2023
2299009
add non batch override for ORT
markurtz May 25, 2023
2935b77
clean up generation pipeline
markurtz Jun 9, 2023
b89b156
Merge branch 'main' into feature/damian/do_not_save_to_tmp
dbogunowicz Jun 11, 2023
dc3d61b
initial commit
dbogunowicz Jun 5, 2023
a294265
Update src/deepsparse/license.py
dbogunowicz Jun 5, 2023
af97f2b
limit to 150mb
dbogunowicz Jun 7, 2023
c117788
ready to review
dbogunowicz Jun 7, 2023
4ad5f49
fix the erronous Makefile
dbogunowicz Jun 13, 2023
9e816bb
Merge branch 'feature/damian/do_not_save_to_tmp' of https://github.co…
dbogunowicz Jun 13, 2023
f97467f
perhaps fixed GHA
dbogunowicz Jun 13, 2023
6be8d87
take into consideration that GHA creates four files
dbogunowicz Jun 13, 2023
e2f088d
initial commit
dbogunowicz Jun 13, 2023
9fc6c64
Merge remote-tracking branch 'origin/feature/damian/do_not_save_to_tm…
dbogunowicz Jun 13, 2023
a610faf
tested with actual model
dbogunowicz Jun 13, 2023
347d1fb
remove val_inp argument
dbogunowicz Jun 13, 2023
e11027c
Update README.md
dbogunowicz Jun 13, 2023
a950910
Apply suggestions from code review
dbogunowicz Jun 13, 2023
c1d02dc
Update README.md
dbogunowicz Jun 13, 2023
711cdfb
Merge branch 'main' into feature/damian/codegen_pipeline_clean
dbogunowicz Jun 13, 2023
e602662
Merge branch 'main' into feature/damian/codegen_pipeline_clean
dbogunowicz Jun 14, 2023
2085c37
[BugFix] Update deepsparse dockerfile (#1069)
rahul-tuli Jun 14, 2023
2f7bc95
initial implementation
dbogunowicz Jun 15, 2023
e18fab7
working implementation for pipeline input
dbogunowicz Jun 16, 2023
0358d87
[Fix] Fix CLI benchmark errors (#1071)
dbogunowicz Jun 15, 2023
06b5246
Merge branch 'main' into feature/damian/codegen_pipeline_clean
dbogunowicz Jun 16, 2023
2cab681
Merge branch 'feature/damian/codegen_pipeline_clean' into feature/dam…
dbogunowicz Jun 16, 2023
63b116b
Clean a typo in the pipeline code
dbogunowicz Jun 16, 2023
cde08b9
initial commit
dbogunowicz Jun 21, 2023
99d125c
Merge branch 'main' into feature/damian/fb_kv_cache
dbogunowicz Jun 22, 2023
67ffe47
Merge branch 'main' into feature/damian/fb_kv_cache
dbogunowicz Jun 26, 2023
9937686
Merge branch 'main' into feature/damian/fb_kv_cache
dbogunowicz Jun 27, 2023
0d6a423
[KV Cache Interface] DecoderKVCache (#1084)
dbogunowicz Jun 28, 2023
0809aea
[WiP] [KV Cache Interface] Text Generation & Decoder Engine Implement…
dbogunowicz Jun 28, 2023
7001a6e
working implementation, time to cleanup
dbogunowicz Jun 29, 2023
c1bf5b7
now kv cache decoder holds information about the num of tokens prepro…
dbogunowicz Jun 29, 2023
79251e6
cleanup the old files
dbogunowicz Jun 29, 2023
9efbdb6
Update src/deepsparse/transformers/engines/nl_decoder_engine.py
dbogunowicz Jun 29, 2023
da5e93e
ready for review
dbogunowicz Jun 29, 2023
a680dac
ready for testing
dbogunowicz Jun 29, 2023
7099994
managed to get first logits right
dbogunowicz Jun 29, 2023
1d4d96d
Delete example
dbogunowicz Jun 29, 2023
08e5421
cleanup before sharing with Ben and Sage
dbogunowicz Jun 29, 2023
bfaa072
Merge branch 'feature/damian/pipeline_engine_support' of https://gith…
dbogunowicz Jun 29, 2023
fbeeb4a
Update src/deepsparse/transformers/engines/nl_decoder_engine.py
dbogunowicz Jun 29, 2023
f83dcab
assert proper padding on pipeline init
dbogunowicz Jul 3, 2023
e659c33
now also supporting kv cache perplexity. time for cleanup
dbogunowicz Jul 3, 2023
cf74ad7
ready for review
dbogunowicz Jul 3, 2023
853f876
correctly print engine info
dbogunowicz Jul 3, 2023
e8da07e
work with left padding of the tokenizer
dbogunowicz Jul 3, 2023
58b12c8
quality
dbogunowicz Jul 3, 2023
eecd232
fix the multitoken inference
dbogunowicz Jul 5, 2023
10c804a
Perplexity Eval for Text Generation Models (#1073)
dbogunowicz Jul 5, 2023
7bd23d6
Merge branch 'main' into feature/damian/fb_kv_cache
dbogunowicz Jul 5, 2023
10ba82e
[Text Generation] Run deepsparse engine without the LIB.kv_cache obje…
dbogunowicz Jul 7, 2023
e81c327
added few improvements that turned out to be useful post manual testing
dbogunowicz Jul 7, 2023
b737f77
Update src/deepsparse/transformers/engines/nl_decoder_engine.py
dbogunowicz Jul 7, 2023
042cb79
fixed the logic to assert correct multibatch inference
dbogunowicz Jul 7, 2023
bf4eac3
Merge branch 'feature/damian/fb_kv_cache' of https://github.com/neura…
dbogunowicz Jul 7, 2023
c8a1f93
fix integration tests
dbogunowicz Jul 7, 2023
d2d3dc1
initial implementation
dbogunowicz Jul 10, 2023
6ce1ca4
perplexity working, so as batched inference for different sized inputs
dbogunowicz Jul 10, 2023
47dc986
Merge branch 'main' into feature/damian/fb_kv_cache
dbogunowicz Jul 10, 2023
ef77d91
fix the integration test
dbogunowicz Jul 10, 2023
f0d74b0
Merge branch 'feature/damian/fb_kv_cache' of https://github.com/neura…
dbogunowicz Jul 10, 2023
186c80c
better solution for fixing the issues caused by this PR in GHA
dbogunowicz Jul 10, 2023
09993e7
revert changes to yolo pipeline
dbogunowicz Jul 10, 2023
ba8c126
Merge branch 'main' into feature/damian/fb_kv_cache
dbogunowicz Jul 11, 2023
37e8a02
Update src/deepsparse/transformers/engines/nl_decoder_engine.py
dbogunowicz Jul 11, 2023
0d308b9
response to Rahuls comments
dbogunowicz Jul 11, 2023
41e9306
Merge remote-tracking branch 'origin/main' into feature/damian/fb_kv_…
dbogunowicz Jul 12, 2023
48 changes: 0 additions & 48 deletions src/deepsparse/engine.py
@@ -28,7 +28,6 @@
from deepsparse.benchmark import BenchmarkResults
from deepsparse.utils import (
generate_random_inputs,
get_output_names,
model_to_path,
override_onnx_input_shapes,
)
@@ -54,7 +53,6 @@
"Scheduler",
"Context",
"MultiModelEngine",
"KVCacheEngine",
"BaseEngine",
]

@@ -845,52 +843,6 @@ def __init__(
)


class KVCacheEngine(Engine):
"""
Engine that can do kv caching.
"""

def __init__(
self,
model: Union[str, "Model", "File"],
batch_size: int = 1,
num_cores: int = None,
num_streams: int = None,
scheduler: Scheduler = None,
input_shapes: List[List[int]] = None,
kv_cache_bools: List[bool] = None,
prev_cache_length: int = 0,
):
BaseEngine.construct(
self, model, batch_size, num_cores, num_streams, scheduler, input_shapes
)

if kv_cache_bools is None:
# If no list was provided, then we assume all outputs except for the first are KV caches
# Note: In the future we can look at the names of outputs to be more sure
#
# Create a boolean list of every output of the model
output_names = get_output_names(self._model_path)
kv_cache_bools = [True for i in range(len(output_names))]
# Assume first input is logits and logits ought not to be cached
kv_cache_bools[0] = False

num_streams = _validate_num_streams(num_streams, self._num_cores)
if self._input_shapes:
raise NotImplementedError("Don't do this yet :)")
else:
self._eng_net = LIB.deepsparse_engine(
self._model_path,
self._batch_size,
self._num_cores,
num_streams,
self._scheduler.value,
None,
kv_cache_bools,
prev_cache_length,
)


def compile_model(
model: Union[str, "Model", "File"],
batch_size: int = 1,
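For context on the deletion above: the removed `KVCacheEngine` built its default `kv_cache_bools` by flagging every model output as a KV-cache tensor except the first, which it assumed to be the logits. A minimal sketch of that default (the helper name here is hypothetical, not part of the deepsparse API):

```python
def build_kv_cache_bools(output_names):
    # Mark every output as a KV-cache tensor by default ...
    flags = [True] * len(output_names)
    if flags:
        # ... except the first output, assumed to be the logits,
        # which ought not to be cached
        flags[0] = False
    return flags


print(build_kv_cache_bools(["logits", "past_key_0", "past_value_0"]))
# [False, True, True]
```

As the original comment notes, inspecting output names (rather than relying on position) would make this assumption more robust.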
56 changes: 35 additions & 21 deletions src/deepsparse/pipeline.py
@@ -53,6 +53,7 @@
"yolo_pipeline",
"Bucketable",
"BucketingPipeline",
"create_engine",
]

DEEPSPARSE_ENGINE = "deepsparse"
@@ -157,6 +158,7 @@ def __init__(
logger: Optional[Union[BaseLogger, str]] = None,
benchmark: bool = False,
_delay_engine_initialize: bool = False, # internal use only
_delay_overwriting_inputs: bool = False, # internal use only
):
self._benchmark = benchmark
self._model_path_orig = model_path
@@ -200,7 +202,7 @@ def __init__(
if engine_type.lower() == DEEPSPARSE_ENGINE:
self._engine_args["scheduler"] = scheduler

self.onnx_file_path = self.setup_onnx_file_path()
self.onnx_file_path = self.setup_onnx_file_path(_delay_overwriting_inputs)

if _delay_engine_initialize:
self.engine = None
@@ -810,26 +812,10 @@ def log_inference_times(self, timer: StagedTimer):
category=MetricCategories.SYSTEM,
)

def _initialize_engine(self) -> Union[Engine, ORTEngine]:
engine_type = self.engine_type.lower()

if engine_type == DEEPSPARSE_ENGINE:
if self.context is not None and isinstance(self.context, Context):
self._engine_args.pop("num_cores", None)
self._engine_args.pop("scheduler", None)
self._engine_args["context"] = self.context
return MultiModelEngine(
model=self.onnx_file_path,
**self._engine_args,
)
return Engine(self.onnx_file_path, **self._engine_args)
elif engine_type == ORT_ENGINE:
return ORTEngine(self.onnx_file_path, **self._engine_args)
else:
raise ValueError(
f"Unknown engine_type {self.engine_type}. Supported values include: "
f"{SUPPORTED_PIPELINE_ENGINES}"
)
def _initialize_engine(self) -> Union[Engine, MultiModelEngine, ORTEngine]:
return create_engine(
self.onnx_file_path, self.engine_type, self._engine_args, self.context
)

def _identifier(self):
# get pipeline identifier; used in the context of logging
@@ -1007,6 +993,34 @@ def route_input_to_bucket(
pass


def create_engine(
onnx_file_path: str,
engine_type: str,
engine_args: Dict,
context: Optional[Context] = None,
) -> Union[Engine, MultiModelEngine, ORTEngine]:
engine_type = engine_type.lower()

if engine_type == DEEPSPARSE_ENGINE:
if context is not None and isinstance(context, Context):
engine_args.pop("num_cores", None)
engine_args.pop("scheduler", None)
engine_args["context"] = context
return MultiModelEngine(
model=onnx_file_path,
**engine_args,
)
return Engine(onnx_file_path, **engine_args)

if engine_type == ORT_ENGINE:
return ORTEngine(onnx_file_path, **engine_args)

raise ValueError(
f"Unknown engine_type {engine_type}. Supported values include: "
f"{SUPPORTED_PIPELINE_ENGINES}"
)


def _initialize_executor_and_workers(
batch_size: Optional[int],
workers_or_executor: Optional[Union[int, ThreadPoolExecutor]],
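The new `create_engine` helper factors the engine dispatch out of `Pipeline._initialize_engine` so the same logic can be reused outside the class. A self-contained sketch of that dispatch, with stub classes standing in for the real deepsparse `Engine`, `MultiModelEngine`, `ORTEngine`, and `Context`:

```python
DEEPSPARSE_ENGINE = "deepsparse"
ORT_ENGINE = "onnxruntime"
SUPPORTED_PIPELINE_ENGINES = [DEEPSPARSE_ENGINE, ORT_ENGINE]


class Engine:
    def __init__(self, model, **kwargs):
        self.model = model


class MultiModelEngine(Engine):
    pass


class ORTEngine(Engine):
    pass


class Context:
    pass


def create_engine(onnx_file_path, engine_type, engine_args, context=None):
    engine_type = engine_type.lower()
    if engine_type == DEEPSPARSE_ENGINE:
        if context is not None and isinstance(context, Context):
            # a Context manages cores/scheduling itself, so drop
            # those args and hand the context to a MultiModelEngine
            engine_args.pop("num_cores", None)
            engine_args.pop("scheduler", None)
            engine_args["context"] = context
            return MultiModelEngine(model=onnx_file_path, **engine_args)
        return Engine(onnx_file_path, **engine_args)
    if engine_type == ORT_ENGINE:
        return ORTEngine(onnx_file_path, **engine_args)
    raise ValueError(
        f"Unknown engine_type {engine_type}. Supported values include: "
        f"{SUPPORTED_PIPELINE_ENGINES}"
    )


print(type(create_engine("model.onnx", "deepsparse", {})).__name__)
# Engine
```

With this factored out, `_initialize_engine` reduces to a one-line call, as the diff shows.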
23 changes: 23 additions & 0 deletions src/deepsparse/tasks.py
@@ -95,6 +95,12 @@ class SupportedTasks:
),
)

text_generation = namedtuple("text_generation", ["opt", "codegen", "bloom"])(
codegen=AliasedTask("codegen", []),
opt=AliasedTask("opt", []),
bloom=AliasedTask("bloom", []),
)

image_classification = namedtuple("image_classification", ["image_classification"])(
image_classification=AliasedTask(
"image_classification",
@@ -150,6 +156,9 @@ def check_register_task(
# custom task, register the CustomPipeline
import deepsparse.pipelines.custom_pipeline # noqa: F401

elif cls.is_text_generation(task):
import deepsparse.transformers.pipelines.text_generation # noqa: F401

elif cls.is_nlp(task):
# trigger transformers pipelines to register with Pipeline.register
import deepsparse.transformers.pipelines # noqa: F401
@@ -193,6 +202,20 @@ def check_register_task(
f"{list(all_tasks)}"
)

@classmethod
def is_text_generation(cls, task: str) -> bool:
"""
:param task: the name of the task to check whether it is a text generation task
such as codegen
:return: True if it is a text generation task, False otherwise
"""
return any(
[
text_generation_task.matches(task)
for text_generation_task in cls.text_generation
]
)

@classmethod
def is_nlp(cls, task: str) -> bool:
"""
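The new `is_text_generation` classmethod relies on `AliasedTask.matches` to route task names such as `opt`, `codegen`, and `bloom` to the text-generation pipeline import. A minimal stub approximating that matching (the normalization shown is an assumption; the real `AliasedTask` may handle aliases and name variants differently):

```python
class AliasedTask:
    def __init__(self, name, aliases):
        self._name = name
        self._aliases = aliases

    def matches(self, task):
        # normalize the incoming name, then compare against the
        # canonical name and any registered aliases
        task = task.lower().replace("-", "_")
        return task == self._name or task in self._aliases


TEXT_GENERATION_TASKS = [
    AliasedTask("codegen", []),
    AliasedTask("opt", []),
    AliasedTask("bloom", []),
]


def is_text_generation(task):
    return any(t.matches(task) for t in TEXT_GENERATION_TASKS)


print(is_text_generation("opt"))   # True
print(is_text_generation("bert"))  # False
```

Note the `any([...])` in the diff builds a full list before checking; a bare generator expression, as sketched here, short-circuits on the first match.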
15 changes: 15 additions & 0 deletions src/deepsparse/transformers/engines/__init__.py
@@ -0,0 +1,15 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
from .nl_decoder_engine import *