From c9b274292877e5bbe6ce6c42fd4fc3a5755c87b9 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Mon, 10 Jun 2024 13:48:39 +0800 Subject: [PATCH 01/14] cast image features to model.dtype where needed to support FP16 or other precision in pipelines --- src/transformers/pipelines/depth_estimation.py | 2 ++ src/transformers/pipelines/document_question_answering.py | 5 ++++- src/transformers/pipelines/image_classification.py | 4 ++++ src/transformers/pipelines/image_feature_extraction.py | 6 +++++- src/transformers/pipelines/image_segmentation.py | 6 ++++++ src/transformers/pipelines/image_to_image.py | 4 ++++ src/transformers/pipelines/image_to_text.py | 8 ++++++++ src/transformers/pipelines/mask_generation.py | 2 ++ src/transformers/pipelines/object_detection.py | 2 ++ src/transformers/pipelines/video_classification.py | 4 ++++ src/transformers/pipelines/visual_question_answering.py | 4 ++++ .../pipelines/zero_shot_image_classification.py | 2 ++ src/transformers/pipelines/zero_shot_object_detection.py | 2 ++ 13 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index c6431a499717..71e7c61cc5db 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -91,6 +91,8 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout) self.image_size = image.size model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py index 64714390b04f..9c307d49a074 100644 --- a/src/transformers/pipelines/document_question_answering.py +++ b/src/transformers/pipelines/document_question_answering.py @@ -294,7 +294,10 @@ def preprocess( if input.get("image", None) is not None: image = load_image(input["image"], timeout=timeout) if self.image_processor is not None: - image_features.update(self.image_processor(images=image, return_tensors=self.framework)) + image_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == 'pt': + image_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_inputs.items()} + image_features.update(image_inputs) elif self.feature_extractor is not None: image_features.update(self.feature_extractor(images=image, return_tensors=self.framework)) elif self.model_type == ModelType.VisionEncoderDecoder: diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 62793c252a6b..f53ea65de3a6 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -23,6 +23,8 @@ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES if is_torch_available(): + import torch + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES logger = logging.get_logger(__name__) @@ -159,6 +161,8 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) model_inputs = self.image_processor(images=image, return_tensors=self.framework) + 
if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index 3a361deabd79..eafd74521596 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -1,12 +1,14 @@ from typing import Dict -from ..utils import add_end_docstrings, is_vision_available +from ..utils import add_end_docstrings, is_vision_available, is_torch_available from .base import GenericTensor, Pipeline, build_pipeline_init_args if is_vision_available(): from ..image_utils import load_image +if is_torch_available(): + import torch @add_end_docstrings( build_pipeline_init_args(has_image_processor=True), @@ -60,6 +62,8 @@ def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]: image = load_image(image, timeout=timeout) model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 23fbd4fb79b1..6fcae2c65856 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -12,6 +12,8 @@ from ..image_utils import load_image if is_torch_available(): + import torch + from ..models.auto.modeling_auto import ( MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES, @@ -147,6 +149,8 @@ def preprocess(self, image, subtask=None, timeout=None): else: kwargs = {"task_inputs": [subtask]} inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs) + if self.framework == 'pt': + inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} inputs["task_inputs"] = self.tokenizer( inputs["task_inputs"], padding="max_length", @@ -155,6 +159,8 @@ def preprocess(self, image, subtask=None, timeout=None): )["input_ids"] else: inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == 'pt': + inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} inputs["target_size"] = target_size return inputs diff --git a/src/transformers/pipelines/image_to_image.py b/src/transformers/pipelines/image_to_image.py index 8c34ee8dd3c8..0eadaa5da692 100644 --- a/src/transformers/pipelines/image_to_image.py +++ b/src/transformers/pipelines/image_to_image.py @@ -31,6 +31,8 @@ from ..image_utils import load_image if is_torch_available(): + import torch + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES logger = logging.get_logger(__name__) @@ -119,6 +121,8 @@ def _forward(self, model_inputs): def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == 'pt': + inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} return inputs def postprocess(self, model_outputs): 
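As an illustrative aside (not part of the patch): the dict-comprehension cast repeated in each `preprocess` above only converts `float32` tensors, so integer inputs such as `input_ids` keep their dtype. A minimal, self-contained sketch of that pattern, with made-up tensor names:

```python
import torch

# Example inputs as an image processor might return them (names are hypothetical)
model_inputs = {
    "pixel_values": torch.zeros(1, 3, 224, 224),  # float32 -> will be cast
    "input_ids": torch.tensor([[101, 102]]),      # int64 -> left untouched
}
torch_dtype = torch.float16

# Same cast pattern as in the patch: only float32 tensors are converted
model_inputs = {k: v.type(torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()}
print({k: v.dtype for k, v in model_inputs.items()})
# {'pixel_values': torch.float16, 'input_ids': torch.int64}
```
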
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 4a9a3744d841..8ed64a8f9c74 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -138,6 +138,8 @@ def preprocess(self, image, prompt=None, timeout=None): if model_type == "git": model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids input_ids = [self.tokenizer.cls_token_id] + input_ids input_ids = torch.tensor(input_ids).unsqueeze(0) @@ -145,10 +147,14 @@ def preprocess(self, image, prompt=None, timeout=None): elif model_type == "pix2struct": model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} elif model_type != "vision-encoder-decoder": # vision-encoder-decoder does not support conditional generation model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} text_inputs = self.tokenizer(prompt, return_tensors=self.framework) model_inputs.update(text_inputs) @@ -157,6 +163,8 @@ def preprocess(self, image, prompt=None, timeout=None): else: model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} if self.model.config.model_type == "git" and prompt is None: model_inputs["input_ids"] = None diff --git a/src/transformers/pipelines/mask_generation.py b/src/transformers/pipelines/mask_generation.py index 68d407aff2d4..f3b2b04acbed 100644 --- a/src/transformers/pipelines/mask_generation.py +++ b/src/transformers/pipelines/mask_generation.py @@ -181,6 +181,8 @@ def preprocess( image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor ) model_inputs = self.image_processor(images=cropped_images, return_tensors="pt") + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} with self.device_placement(): if self.framework == "pt": diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index 36946cbf8a45..6b8003493386 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -107,6 +107,8 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) target_size = torch.IntTensor([[image.height, image.width]]) inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == 'pt': + inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} if self.tokenizer is not None: inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt") inputs["target_size"] = target_size diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 5702f23c5f60..0748ce3cd130 100644 --- 
a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -19,6 +19,8 @@ if is_torch_available(): + import torch + from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES logger = logging.get_logger(__name__) @@ -106,6 +108,8 @@ def preprocess(self, video, num_frames=None, frame_sampling_rate=1): video = list(video) model_inputs = self.image_processor(video, return_tensors=self.framework) + if self.framework == 'pt': + model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py index 9455b0d85928..90902cd66f3e 100644 --- a/src/transformers/pipelines/visual_question_answering.py +++ b/src/transformers/pipelines/visual_question_answering.py @@ -10,6 +10,8 @@ from ..image_utils import load_image if is_torch_available(): + import torch + from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES from .pt_utils import KeyDataset @@ -155,6 +157,8 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None): truncation=truncation, ) image_features = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == 'pt': + image_features = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_features.items()} model_inputs.update(image_features) return model_inputs diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index 8e40d0e6a5cb..c4b0b721d953 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -120,6 +120,8 @@ def _sanitize_parameters(self, **kwargs): def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors=self.framework) + if self.framework == 'pt': + inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in candidate_labels] padding = "max_length" if self.model.config.model_type == "siglip" else True diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py index 5be89332cbd9..026095652b0d 100644 --- a/src/transformers/pipelines/zero_shot_object_detection.py +++ b/src/transformers/pipelines/zero_shot_object_detection.py @@ -156,6 +156,8 @@ def preprocess(self, inputs, timeout=None): for i, candidate_label in enumerate(candidate_labels): text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework) image_features = self.image_processor(image, return_tensors=self.framework) + if self.framework == 'pt': + image_features = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_features.items()} yield { "is_last": i == len(candidate_labels) - 1, "target_size": target_size, From 18ae03b3d484716bc35478787019d585e04a035a Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 11 Jun 2024 08:18:50 +0800 Subject: [PATCH 02/14] Update src/transformers/pipelines/image_feature_extraction.py Co-authored-by: 
amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/pipelines/image_feature_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index eafd74521596..0546db28fff1 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -1,6 +1,6 @@ from typing import Dict -from ..utils import add_end_docstrings, is_vision_available, is_torch_available +from ..utils import add_end_docstrings, is_torch_available, is_vision_available from .base import GenericTensor, Pipeline, build_pipeline_init_args From 6a28f4b577fce8d1f4057a6062db98788a1b6ca1 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Thu, 13 Jun 2024 23:50:46 +0800 Subject: [PATCH 03/14] Use .to instead --- src/transformers/pipelines/depth_estimation.py | 2 +- src/transformers/pipelines/document_question_answering.py | 2 +- src/transformers/pipelines/image_classification.py | 2 +- src/transformers/pipelines/image_feature_extraction.py | 2 +- src/transformers/pipelines/image_segmentation.py | 4 ++-- src/transformers/pipelines/image_to_image.py | 2 +- src/transformers/pipelines/image_to_text.py | 8 ++++---- src/transformers/pipelines/mask_generation.py | 2 +- src/transformers/pipelines/object_detection.py | 2 +- src/transformers/pipelines/video_classification.py | 2 +- src/transformers/pipelines/visual_question_answering.py | 2 +- .../pipelines/zero_shot_image_classification.py | 2 +- src/transformers/pipelines/zero_shot_object_detection.py | 2 +- 13 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index 71e7c61cc5db..c27d6e2b7724 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -92,7 +92,7 @@ def preprocess(self, image, timeout=None): self.image_size = image.size model_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py index 9c307d49a074..f97f9e098d5d 100644 --- a/src/transformers/pipelines/document_question_answering.py +++ b/src/transformers/pipelines/document_question_answering.py @@ -296,7 +296,7 @@ def preprocess( if self.image_processor is not None: image_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - image_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_inputs.items()} + image_inputs = image_inputs.to(self.torch_dtype) image_features.update(image_inputs) elif self.feature_extractor is not None: image_features.update(self.feature_extractor(images=image, return_tensors=self.framework)) diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index f53ea65de3a6..4e4ef96e1c02 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -162,7 +162,7 @@ def preprocess(self, image, timeout=None): image = load_image(image, 
timeout=timeout) model_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index 0546db28fff1..643c3617b664 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -63,7 +63,7 @@ def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, image = load_image(image, timeout=timeout) model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 6fcae2c65856..dfb43eca6c7f 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -150,7 +150,7 @@ def preprocess(self, image, subtask=None, timeout=None): kwargs = {"task_inputs": [subtask]} inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs) if self.framework == 'pt': - inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} + inputs = inputs.to(self.torch_dtype) inputs["task_inputs"] = self.tokenizer( inputs["task_inputs"], padding="max_length", @@ -160,7 +160,7 @@ def preprocess(self, image, subtask=None, timeout=None): else: inputs = self.image_processor(images=[image], return_tensors="pt") if self.framework == 'pt': - inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} + inputs = inputs.to(self.torch_dtype) inputs["target_size"] = target_size return inputs diff --git a/src/transformers/pipelines/image_to_image.py b/src/transformers/pipelines/image_to_image.py index 0eadaa5da692..c11ade685d7c 100644 --- a/src/transformers/pipelines/image_to_image.py +++ b/src/transformers/pipelines/image_to_image.py @@ -122,7 +122,7 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors="pt") if self.framework == 'pt': - inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} + inputs = inputs.to(self.torch_dtype) return inputs def postprocess(self, model_outputs): diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 8ed64a8f9c74..8cc76c37e39e 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -139,7 +139,7 @@ def preprocess(self, image, prompt=None, timeout=None): if model_type == "git": model_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids input_ids = 
[self.tokenizer.cls_token_id] + input_ids input_ids = torch.tensor(input_ids).unsqueeze(0) @@ -148,13 +148,13 @@ def preprocess(self, image, prompt=None, timeout=None): elif model_type == "pix2struct": model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) elif model_type != "vision-encoder-decoder": # vision-encoder-decoder does not support conditional generation model_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) text_inputs = self.tokenizer(prompt, return_tensors=self.framework) model_inputs.update(text_inputs) @@ -164,7 +164,7 @@ def preprocess(self, image, prompt=None, timeout=None): else: model_inputs = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) if self.model.config.model_type == "git" and prompt is None: model_inputs["input_ids"] = None diff --git a/src/transformers/pipelines/mask_generation.py b/src/transformers/pipelines/mask_generation.py index f3b2b04acbed..32d66a6924a4 100644 --- a/src/transformers/pipelines/mask_generation.py +++ b/src/transformers/pipelines/mask_generation.py @@ -182,7 +182,7 @@ def preprocess( ) model_inputs = self.image_processor(images=cropped_images, return_tensors="pt") if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) with self.device_placement(): if self.framework == "pt": diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index 6b8003493386..7536172ce8b8 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -108,7 +108,7 @@ def preprocess(self, image, timeout=None): target_size = torch.IntTensor([[image.height, image.width]]) inputs = self.image_processor(images=[image], return_tensors="pt") if self.framework == 'pt': - inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} + inputs = inputs.to(self.torch_dtype) if self.tokenizer is not None: inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt") inputs["target_size"] = target_size diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 0748ce3cd130..4a644522adca 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -109,7 +109,7 @@ def preprocess(self, video, num_frames=None, frame_sampling_rate=1): model_inputs = self.image_processor(video, return_tensors=self.framework) if self.framework == 'pt': - model_inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in model_inputs.items()} + model_inputs = model_inputs.to(self.torch_dtype) return model_inputs def _forward(self, model_inputs): diff --git 
a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py index 90902cd66f3e..a174ac25c89a 100644 --- a/src/transformers/pipelines/visual_question_answering.py +++ b/src/transformers/pipelines/visual_question_answering.py @@ -158,7 +158,7 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None): ) image_features = self.image_processor(images=image, return_tensors=self.framework) if self.framework == 'pt': - image_features = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_features.items()} + image_features = image_features.to(self.torch_dtype) model_inputs.update(image_features) return model_inputs diff --git a/src/transformers/pipelines/zero_shot_image_classification.py b/src/transformers/pipelines/zero_shot_image_classification.py index c4b0b721d953..de453aa52f2a 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -121,7 +121,7 @@ def preprocess(self, image, candidate_labels=None, hypothesis_template="This is image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors=self.framework) if self.framework == 'pt': - inputs = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in inputs.items()} + inputs = inputs.to(self.torch_dtype) inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in candidate_labels] padding = "max_length" if self.model.config.model_type == "siglip" else True diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py index 026095652b0d..52a108c627eb 100644 --- a/src/transformers/pipelines/zero_shot_object_detection.py +++ b/src/transformers/pipelines/zero_shot_object_detection.py @@ -157,7 +157,7 @@ def preprocess(self, inputs, timeout=None): text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework) image_features = self.image_processor(image, return_tensors=self.framework) if self.framework == 'pt': - image_features = {k: v.type(self.torch_dtype) if v.dtype == torch.float32 else v for k, v in image_features.items()} + image_features = image_features.to(self.torch_dtype) yield { "is_last": i == len(candidate_labels) - 1, "target_size": target_size, From a929dd1a4e46a5aa61cdb6e3ff8967c1addbb469 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 11:42:54 +0800 Subject: [PATCH 04/14] Add FP16 pipeline support for zeroshot audio classification --- src/transformers/pipelines/zero_shot_audio_classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index c3606e3c2b83..d9109aebd9c5 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -121,6 +121,8 @@ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is inputs = self.feature_extractor( [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" ) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in candidate_labels] text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True) From 
78973649f82a272f5e7ae9c48a89af0fe23b651f Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 12:04:47 +0800 Subject: [PATCH 05/14] Remove unused torch imports --- src/transformers/pipelines/image_classification.py | 2 -- src/transformers/pipelines/image_feature_extraction.py | 3 --- src/transformers/pipelines/image_segmentation.py | 2 -- src/transformers/pipelines/image_to_image.py | 2 -- src/transformers/pipelines/video_classification.py | 2 -- src/transformers/pipelines/visual_question_answering.py | 2 -- 6 files changed, 13 deletions(-) diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 4e4ef96e1c02..49e6574bc96a 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -23,8 +23,6 @@ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES if is_torch_available(): - import torch - from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES logger = logging.get_logger(__name__) diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index 643c3617b664..5cd304f5f755 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -7,9 +7,6 @@ if is_vision_available(): from ..image_utils import load_image -if is_torch_available(): - import torch - @add_end_docstrings( build_pipeline_init_args(has_image_processor=True), """ diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index dfb43eca6c7f..12e9353c9ba1 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -12,8 +12,6 @@ from ..image_utils import load_image if is_torch_available(): - import torch - from ..models.auto.modeling_auto import ( MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES, diff --git a/src/transformers/pipelines/image_to_image.py b/src/transformers/pipelines/image_to_image.py index c11ade685d7c..5c2acfe9eb30 100644 --- a/src/transformers/pipelines/image_to_image.py +++ b/src/transformers/pipelines/image_to_image.py @@ -31,8 +31,6 @@ from ..image_utils import load_image if is_torch_available(): - import torch - from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES logger = logging.get_logger(__name__) diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index 4a644522adca..c08a3bd3ab8e 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -19,8 +19,6 @@ if is_torch_available(): - import torch - from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES logger = logging.get_logger(__name__) diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py index a174ac25c89a..1ea279e2e81a 100644 --- a/src/transformers/pipelines/visual_question_answering.py +++ b/src/transformers/pipelines/visual_question_answering.py @@ -10,8 +10,6 @@ from ..image_utils import load_image if is_torch_available(): - import torch - from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES from .pt_utils import KeyDataset From 20289fb92c8558df805dbbdaddb6e9aba700b680 
Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 12:17:20 +0800 Subject: [PATCH 06/14] Add docs on FP16 pipeline --- docs/source/en/main_classes/pipelines.md | 5 +++++ docs/source/en/pipeline_tutorial.md | 5 +++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/source/en/main_classes/pipelines.md b/docs/source/en/main_classes/pipelines.md index d7a701700d13..d5d132aaaba5 100644 --- a/docs/source/en/main_classes/pipelines.md +++ b/docs/source/en/main_classes/pipelines.md @@ -270,6 +270,11 @@ This is a simplified view, since the pipeline can handle automatically the batch about how many forward passes you inputs are actually going to trigger, you can optimize the `batch_size` independently of the inputs. The caveats from the previous section still apply. +## Pipeline FP16 inference +Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will. + +To enable FP16 inference, you can simply pass `torch_dtype=torch.float16` or `torch_dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally. + ## Pipeline custom code If you want to override a specific pipeline. diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index 8518f639ab9d..838b89432b4a 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -113,7 +113,9 @@ This will work regardless of whether you are using PyTorch or Tensorflow. transcriber = pipeline(model="openai/whisper-large-v2", device=0) ``` -If the model is too large for a single GPU and you are using PyTorch, you can set `device_map="auto"` to automatically +If the model is too large for a single GPU and you are using PyTorch, you can set `torch_dtype='float16'` to enable FP16 precision inference. Usually this would not cause significant performance drops but make sure you evaluate it on your models! + +Alternatively, you can set `device_map="auto"` to automatically determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate) package: @@ -342,4 +344,3 @@ gr.Interface.from_pipeline(pipe).launch() By default, the web demo runs on a local server. If you'd like to share it with others, you can generate a temporary public link by setting `share=True` in `launch()`. You can also host your demo on [Hugging Face Spaces](https://huggingface.co/spaces) for a permanent link. 
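As an illustrative aside (not part of the patch series): a minimal usage sketch of the FP16 pipeline inference described by the docs added in this patch. The checkpoint name and device index are example values only; any PyTorch-backed vision checkpoint would work the same way.

```python
import torch
from transformers import pipeline

# FP16 inference only applies to the PyTorch backend; image features produced by the
# processor are cast to float16 inside the pipeline's preprocess step.
classifier = pipeline(
    "image-classification",
    model="google/vit-base-patch16-224",  # example checkpoint
    torch_dtype=torch.float16,
    device=0,  # example GPU index
)
print(classifier("http://images.cocodataset.org/val2017/000000039769.jpg"))
```
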
- From a1804716fd0d1e3e95ad67c7e4242af0fd7026c9 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 12:19:20 +0800 Subject: [PATCH 07/14] Remove unused import --- src/transformers/pipelines/image_feature_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index 5cd304f5f755..8e0b8b0151c1 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -1,6 +1,6 @@ from typing import Dict -from ..utils import add_end_docstrings, is_torch_available, is_vision_available +from ..utils import add_end_docstrings, is_vision_available from .base import GenericTensor, Pipeline, build_pipeline_init_args From 88d9c2929bc463ac01db17fd28b81cdbc92ddd63 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 13:07:16 +0800 Subject: [PATCH 08/14] Add FP16 tests to pipeline mixin --- tests/test_pipeline_mixin.py | 190 ++++++++++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 16 deletions(-) diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index e7c678cffb7d..9f51bc3f2646 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -126,16 +126,18 @@ class PipelineTesterMixin: pipeline_model_mapping = None supported_frameworks = ["pt", "tf"] - def run_task_tests(self, task): + def run_task_tests(self, task, torch_dtype='float32'): """Run pipeline tests for a specific `task` Args: task (`str`): A task name. This should be a key in the mapping `pipeline_test_mapping`. + torch_dtype (`str`, `optional`, defaults to `'float32'`): + The torch dtype to use for the model. Can be used for FP16/other precision inference. """ if task not in self.pipeline_model_mapping: self.skipTest( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: `{task}` is not in " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: `{task}` is not in " f"`self.pipeline_model_mapping` for `{self.__class__.__name__}`." ) @@ -171,10 +173,10 @@ def run_task_tests(self, task): repo_name = model_arch_name self.run_model_pipeline_tests( - task, repo_name, model_architecture, tokenizer_names, processor_names, commit + task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype ) - def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit): + def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype='float32'): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class names Args: @@ -188,6 +190,10 @@ def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenize A list of names of a subclasses of `PreTrainedTokenizerFast` or `PreTrainedTokenizer`. processor_names (`List[str]`): A list of names of subclasses of `BaseImageProcessor` or `FeatureExtractionMixin`. + commit (`str`): + The commit hash of the model repository on the Hub. + torch_dtype (`str`, `optional`, defaults to `'float32'`): + The torch dtype to use for the model. Can be used for FP16/other precision inference. """ # Get an instance of the corresponding class `XXXPipelineTests` in order to use `get_test_pipeline` and # `run_pipeline_test`. 
@@ -201,16 +207,17 @@ def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenize model_architecture, tokenizer_name, processor_name, + torch_dtype ): logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: test is " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " f"`{tokenizer_name}` | processor `{processor_name}`." ) continue - self.run_pipeline_test(task, repo_name, model_architecture, tokenizer_name, processor_name, commit) + self.run_pipeline_test(task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype) - def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit): + def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype='float32'): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class name The model will be loaded from a model repository on the Hub. @@ -226,6 +233,10 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, The name of a subclass of `PreTrainedTokenizerFast` or `PreTrainedTokenizer`. processor_name (`str`): The name of a subclass of `BaseImageProcessor` or `FeatureExtractionMixin`. + commit (`str`): + The commit hash of the model repository on the Hub. + torch_dtype (`str`, `optional`, defaults to `'float32'`): + The torch dtype to use for the model. Can be used for FP16/other precision inference. """ repo_id = f"{TRANSFORMERS_TINY_MODEL_PATH}/{repo_name}" if TRANSFORMERS_TINY_MODEL_PATH != "hf-internal-testing": @@ -245,7 +256,7 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor = processor_class.from_pretrained(repo_id, revision=commit) except Exception: logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not load the " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not load the " f"processor from `{repo_id}` with `{processor_name}`." ) return @@ -253,7 +264,7 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, # TODO: Maybe not upload such problematic tiny models to Hub. if tokenizer is None and processor is None: logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " f"any tokenizer / processor from `{repo_id}`." ) return @@ -263,15 +274,15 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, model = model_architecture.from_pretrained(repo_id, revision=commit) except Exception: logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not find or load " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not find or load " f"the model from `{repo_id}` with `{model_architecture}`." 
) return pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ - if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor): + if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor, torch_dtype): logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: test is " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " f"`{tokenizer_name}` | processor `{processor_name}`." ) @@ -287,12 +298,12 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, # `run_pipeline_test`. task_test = pipeline_test_mapping[task]["test"]() - pipeline, examples = task_test.get_test_pipeline(model, tokenizer, processor) + pipeline, examples = task_test.get_test_pipeline(model, tokenizer, processor, torch_dtype=torch_dtype) if pipeline is None: # The test can disable itself, but it should be very marginal # Concerns: Wav2Vec2ForCTC without tokenizer test (FastTokenizer don't exist) logger.warning( - f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')} is skipped: Could not get the " + f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: Could not get the " "pipeline for testing." ) return @@ -322,10 +333,20 @@ def data(n): def test_pipeline_audio_classification(self): self.run_task_tests(task="audio-classification") + @is_pipeline_test + @require_torch + def test_pipeline_audio_classification_fp16(self): + self.run_task_tests(task="audio-classification", torch_dtype='float16') + @is_pipeline_test def test_pipeline_automatic_speech_recognition(self): self.run_task_tests(task="automatic-speech-recognition") + @is_pipeline_test + @require_torch + def test_pipeline_automatic_speech_recognition_fp16(self): + self.run_task_tests(task="automatic-speech-recognition", torch_dtype='float16') + @is_pipeline_test @require_vision @require_timm @@ -333,6 +354,13 @@ def test_pipeline_automatic_speech_recognition(self): def test_pipeline_depth_estimation(self): self.run_task_tests(task="depth-estimation") + @is_pipeline_test + @require_vision + @require_timm + @require_torch + def test_pipeline_depth_estimation_fp16(self): + self.run_task_tests(task="depth-estimation", torch_dtype='float16') + @is_pipeline_test @require_pytesseract @require_torch @@ -340,20 +368,43 @@ def test_pipeline_depth_estimation(self): def test_pipeline_document_question_answering(self): self.run_task_tests(task="document-question-answering") + @is_pipeline_test + @require_pytesseract + @require_torch + @require_vision + def test_pipeline_document_question_answering_fp16(self): + self.run_task_tests(task="document-question-answering", torch_dtype='float16') + @is_pipeline_test def test_pipeline_feature_extraction(self): self.run_task_tests(task="feature-extraction") + @is_pipeline_test + @require_torch + def test_pipeline_feature_extraction_fp16(self): + self.run_task_tests(task="feature-extraction", torch_dtype='float16') + @is_pipeline_test def test_pipeline_fill_mask(self): self.run_task_tests(task="fill-mask") + @is_pipeline_test + @require_torch + def test_pipeline_fill_mask_fp16(self): + self.run_task_tests(task="fill-mask", torch_dtype='float16') + @is_pipeline_test @require_torch_or_tf @require_vision def test_pipeline_image_classification(self): 
self.run_task_tests(task="image-classification") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_image_classification_fp16(self): + self.run_task_tests(task="image-classification", torch_dtype='float16') + @is_pipeline_test @require_vision @require_timm @@ -361,11 +412,24 @@ def test_pipeline_image_classification(self): def test_pipeline_image_segmentation(self): self.run_task_tests(task="image-segmentation") + @is_pipeline_test + @require_vision + @require_timm + @require_torch + def test_pipeline_image_segmentation_fp16(self): + self.run_task_tests(task="image-segmentation", torch_dtype='float16') + @is_pipeline_test @require_vision def test_pipeline_image_to_text(self): self.run_task_tests(task="image-to-text") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_image_to_text_fp16(self): + self.run_task_tests(task="image-to-text", torch_dtype='float16') + @is_pipeline_test @require_timm @require_vision @@ -373,6 +437,13 @@ def test_pipeline_image_to_text(self): def test_pipeline_image_feature_extraction(self): self.run_task_tests(task="image-feature-extraction") + @is_pipeline_test + @require_timm + @require_vision + @require_torch + def test_pipeline_image_feature_extraction_fp16(self): + self.run_task_tests(task="image-feature-extraction", torch_dtype='float16') + @unittest.skip(reason="`run_pipeline_test` is currently not implemented.") @is_pipeline_test @require_vision @@ -387,44 +458,96 @@ def test_pipeline_mask_generation(self): def test_pipeline_object_detection(self): self.run_task_tests(task="object-detection") + @is_pipeline_test + @require_vision + @require_timm + @require_torch + def test_pipeline_object_detection_fp16(self): + self.run_task_tests(task="object-detection", torch_dtype='float16') + @is_pipeline_test def test_pipeline_question_answering(self): self.run_task_tests(task="question-answering") + @is_pipeline_test + @require_torch + def test_pipeline_question_answering_fp16(self): + self.run_task_tests(task="question-answering", torch_dtype='float16') + @is_pipeline_test def test_pipeline_summarization(self): self.run_task_tests(task="summarization") + @is_pipeline_test + @require_torch + def test_pipeline_summarization_fp16(self): + self.run_task_tests(task="summarization", torch_dtype='float16') + @is_pipeline_test def test_pipeline_table_question_answering(self): self.run_task_tests(task="table-question-answering") + @is_pipeline_test + @require_torch + def test_pipeline_table_question_answering_fp16(self): + self.run_task_tests(task="table-question-answering", torch_dtype='float16') + @is_pipeline_test def test_pipeline_text2text_generation(self): self.run_task_tests(task="text2text-generation") + @is_pipeline_test + @require_torch + def test_pipeline_text2text_generation_fp16(self): + self.run_task_tests(task="text2text-generation", torch_dtype='float16') + @is_pipeline_test def test_pipeline_text_classification(self): self.run_task_tests(task="text-classification") + @is_pipeline_test + @require_torch + def test_pipeline_text_classification_fp16(self): + self.run_task_tests(task="text-classification", torch_dtype='float16') + @is_pipeline_test @require_torch_or_tf def test_pipeline_text_generation(self): self.run_task_tests(task="text-generation") + @is_pipeline_test + @require_torch + def test_pipeline_text_generation_fp16(self): + self.run_task_tests(task="text-generation", torch_dtype='float16') + @is_pipeline_test @require_torch def test_pipeline_text_to_audio(self): 
self.run_task_tests(task="text-to-audio") + @is_pipeline_test + @require_torch + def test_pipeline_text_to_audio_fp16(self): + self.run_task_tests(task="text-to-audio", torch_dtype='float16') + @is_pipeline_test def test_pipeline_token_classification(self): self.run_task_tests(task="token-classification") + @is_pipeline_test + @require_torch + def test_pipeline_token_classification_fp16(self): + self.run_task_tests(task="token-classification", torch_dtype='float16') + @is_pipeline_test def test_pipeline_translation(self): self.run_task_tests(task="translation") + @is_pipeline_test + @require_torch + def test_pipeline_translation_fp16(self): + self.run_task_tests(task="translation", torch_dtype='float16') + @is_pipeline_test @require_torch_or_tf @require_vision @@ -432,35 +555,70 @@ def test_pipeline_translation(self): def test_pipeline_video_classification(self): self.run_task_tests(task="video-classification") + @is_pipeline_test + @require_vision + @require_decord + @require_torch + def test_pipeline_video_classification_fp16(self): + self.run_task_tests(task="video-classification", torch_dtype='float16') + @is_pipeline_test @require_torch @require_vision def test_pipeline_visual_question_answering(self): self.run_task_tests(task="visual-question-answering") + @is_pipeline_test + @require_torch + @require_vision + def test_pipeline_visual_question_answering_fp16(self): + self.run_task_tests(task="visual-question-answering", torch_dtype='float16') + @is_pipeline_test def test_pipeline_zero_shot(self): self.run_task_tests(task="zero-shot") + @is_pipeline_test + @require_torch + def test_pipeline_zero_shot_fp16(self): + self.run_task_tests(task="zero-shot", torch_dtype='float16') + @is_pipeline_test @require_torch def test_pipeline_zero_shot_audio_classification(self): self.run_task_tests(task="zero-shot-audio-classification") + @is_pipeline_test + @require_torch + def test_pipeline_zero_shot_audio_classification_fp16(self): + self.run_task_tests(task="zero-shot-audio-classification", torch_dtype='float16') + @is_pipeline_test @require_vision def test_pipeline_zero_shot_image_classification(self): self.run_task_tests(task="zero-shot-image-classification") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_zero_shot_image_classification_fp16(self): + self.run_task_tests(task="zero-shot-image-classification", torch_dtype='float16') + @is_pipeline_test @require_vision @require_torch def test_pipeline_zero_shot_object_detection(self): self.run_task_tests(task="zero-shot-object-detection") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_zero_shot_object_detection_fp16(self): + self.run_task_tests(task="zero-shot-object-detection", torch_dtype='float16') + # This contains the test cases to be skipped without model architecture being involved. def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name, torch_dtype ): """Skip some tests based on the classes or their names without the instantiated objects. 
@@ -477,7 +635,7 @@ def is_pipeline_test_to_skip( return False - def is_pipeline_test_to_skip_more(self, pipeline_test_casse_name, config, model, tokenizer, processor): # noqa + def is_pipeline_test_to_skip_more(self, pipeline_test_casse_name, config, model, tokenizer, processor, torch_dtype): # noqa """Skip some more tests based on the information from the instantiated objects.""" # No fix is required for this case. if ( From d01fcafff2c7e2406c743823316a56356f2e4282 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 15:45:32 +0800 Subject: [PATCH 09/14] Add fp16 placeholder for mask_generation pipeline test --- tests/test_pipeline_mixin.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 9f51bc3f2646..8f2aa1e30b22 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -451,6 +451,13 @@ def test_pipeline_image_feature_extraction_fp16(self): def test_pipeline_mask_generation(self): self.run_task_tests(task="mask-generation") + @unittest.skip(reason="`run_pipeline_test` is currently not implemented.") + @is_pipeline_test + @require_vision + @require_torch + def test_pipeline_mask_generation_fp16(self): + self.run_task_tests(task="mask-generation", torch_dtype='float16') + @is_pipeline_test @require_vision @require_timm From 74c652f58e90bd94395bce376e27ed9358cf8015 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 15:47:20 +0800 Subject: [PATCH 10/14] Add FP16 tests for all pipelines --- .../test_pipelines_audio_classification.py | 4 +- ..._pipelines_automatic_speech_recognition.py | 4 +- .../test_pipelines_depth_estimation.py | 4 +- ...t_pipelines_document_question_answering.py | 4 +- .../test_pipelines_feature_extraction.py | 4 +- tests/pipelines/test_pipelines_fill_mask.py | 4 +- .../test_pipelines_image_classification.py | 4 +- ...test_pipelines_image_feature_extraction.py | 4 +- .../test_pipelines_image_segmentation.py | 4 +- .../test_pipelines_image_to_image.py | 10 ++++- .../pipelines/test_pipelines_image_to_text.py | 4 +- .../test_pipelines_mask_generation.py | 4 +- .../test_pipelines_object_detection.py | 4 +- .../test_pipelines_question_answering.py | 4 +- .../pipelines/test_pipelines_summarization.py | 4 +- ...test_pipelines_table_question_answering.py | 40 +++++++++++++++---- .../test_pipelines_text2text_generation.py | 4 +- .../test_pipelines_text_classification.py | 4 +- .../test_pipelines_text_generation.py | 4 +- .../pipelines/test_pipelines_text_to_audio.py | 4 +- .../test_pipelines_token_classification.py | 4 +- tests/pipelines/test_pipelines_translation.py | 6 +-- .../test_pipelines_video_classification.py | 4 +- ...est_pipelines_visual_question_answering.py | 4 +- tests/pipelines/test_pipelines_zero_shot.py | 4 +- ...ipelines_zero_shot_audio_classification.py | 8 +++- ...ipelines_zero_shot_image_classification.py | 8 +++- ...st_pipelines_zero_shot_object_detection.py | 4 +- 28 files changed, 101 insertions(+), 63 deletions(-) diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index 57040a468be4..efa3edf0d0f9 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -35,8 +35,8 @@ class AudioClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, 
processor): - audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=processor, torch_dtype=torch_dtype) # test with a raw waveform audio = np.zeros((34000,)) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 73376ff2189c..26da2004f3e4 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -66,7 +66,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): if tokenizer is None: # Side effect of no Fast Tokenizer class for these model, so skipping # But the slow tokenizer test should still run as they're quite small @@ -75,7 +75,7 @@ def get_test_pipeline(self, model, tokenizer, processor): # return None, None speech_recognizer = AutomaticSpeechRecognitionPipeline( - model=model, tokenizer=tokenizer, feature_extractor=processor + model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype ) # test with a raw waveform diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index abc58ca710b8..ffffd31be6c2 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -56,8 +56,8 @@ def hashimage(image: Image) -> str: class DepthEstimationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - depth_estimator = DepthEstimationPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + depth_estimator = DepthEstimationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return depth_estimator, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_document_question_answering.py b/tests/pipelines/test_pipelines_document_question_answering.py index 81febbc8c176..0d40ee5663e8 100644 --- a/tests/pipelines/test_pipelines_document_question_answering.py +++ b/tests/pipelines/test_pipelines_document_question_answering.py @@ -61,9 +61,9 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase): @require_pytesseract @require_vision - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): dqa_pipeline = pipeline( - "document-question-answering", model=model, tokenizer=tokenizer, image_processor=processor + "document-question-answering", model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype ) image = INVOICE_URL diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index 87c5a151175c..042c0c62b942 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -174,7 +174,7 @@ def get_shape(self, input_, shape=None): raise 
ValueError("We expect lists of floats, nothing else") return shape - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): if tokenizer is None: self.skipTest("No tokenizer") return @@ -195,7 +195,7 @@ def get_test_pipeline(self, model, tokenizer, processor): ) return - feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor) + feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype) return feature_extractor, ["This is a test", "This is another test"] def run_pipeline_test(self, feature_extractor, examples): diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py index bbf2b6cf3f43..fd38fadc6cbf 100644 --- a/tests/pipelines/test_pipelines_fill_mask.py +++ b/tests/pipelines/test_pipelines_fill_mask.py @@ -251,11 +251,11 @@ def test_model_no_pad_tf(self): unmasker.tokenizer.pad_token = None self.run_pipeline_test(unmasker, []) - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): if tokenizer is None or tokenizer.mask_token_id is None: self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)") - fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer) + fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) examples = [ f"This is another {tokenizer.mask_token} test", ] diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py index 9f6a8adfd106..5dc1a4c6dcf5 100644 --- a/tests/pipelines/test_pipelines_image_classification.py +++ b/tests/pipelines/test_pipelines_image_classification.py @@ -51,8 +51,8 @@ class ImageClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - image_classifier = ImageClassificationPipeline(model=model, image_processor=processor, top_k=2) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + image_classifier = ImageClassificationPipeline(model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "http://images.cocodataset.org/val2017/000000039769.jpg", diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.py b/tests/pipelines/test_pipelines_image_feature_extraction.py index 1519c7a97803..8f288ddf2be5 100644 --- a/tests/pipelines/test_pipelines_image_feature_extraction.py +++ b/tests/pipelines/test_pipelines_image_feature_extraction.py @@ -157,7 +157,7 @@ def test_return_tensors_tf(self): outputs = feature_extractor(img, return_tensors=True) self.assertTrue(tf.is_tensor(outputs)) - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): if processor is None: self.skipTest("No image processor") @@ -173,7 +173,7 @@ def get_test_pipeline(self, model, tokenizer, processor): """ ) - feature_extractor = ImageFeatureExtractionPipeline(model=model, image_processor=processor) + feature_extractor = ImageFeatureExtractionPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) img = 
prepare_img() return feature_extractor, [img, img] diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index 6546df2a1b9e..dda31880a996 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -87,8 +87,8 @@ class ImageSegmentationPipelineTests(unittest.TestCase): + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor): - image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_image_to_image.py b/tests/pipelines/test_pipelines_image_to_image.py index e9110bb69295..4612098a389c 100644 --- a/tests/pipelines/test_pipelines_image_to_image.py +++ b/tests/pipelines/test_pipelines_image_to_image.py @@ -54,9 +54,9 @@ class ImageToImagePipelineTests(unittest.TestCase): @require_torch @require_vision @slow - def test_pipeline(self): + def test_pipeline(self, torch_dtype='float32'): model_id = "caidas/swin2SR-classical-sr-x2-64" - upscaler = pipeline("image-to-image", model=model_id) + upscaler = pipeline("image-to-image", model=model_id, torch_dtype=torch_dtype) upscaled_list = upscaler(self.examples) self.assertEqual(len(upscaled_list), len(self.examples)) @@ -66,6 +66,12 @@ def test_pipeline(self): self.assertEqual(upscaled_list[0].size, (1296, 976)) self.assertEqual(upscaled_list[1].size, (1296, 976)) + @require_torch + @require_vision + @slow + def test_pipeline_fp16(self): + self.test_pipeline(torch_dtype='float16') + @require_torch @require_vision @slow diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py index c77353a261f9..cef8626bbabd 100644 --- a/tests/pipelines/test_pipelines_image_to_text.py +++ b/tests/pipelines/test_pipelines_image_to_text.py @@ -45,8 +45,8 @@ class ImageToTextPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_mask_generation.py b/tests/pipelines/test_pipelines_mask_generation.py index c9a44a535483..520153efbe04 100644 --- a/tests/pipelines/test_pipelines_mask_generation.py +++ b/tests/pipelines/test_pipelines_mask_generation.py @@ -67,8 +67,8 @@ class MaskGenerationPipelineTests(unittest.TestCase): (list(TF_MODEL_FOR_MASK_GENERATION_MAPPING.items()) if TF_MODEL_FOR_MASK_GENERATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor): - image_segmenter = MaskGenerationPipeline(model=model, image_processor=processor) + def 
get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + image_segmenter = MaskGenerationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py index ec4984b76f99..a219e5954eba 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -53,8 +53,8 @@ def open(*args, **kwargs): class ObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - object_detector = ObjectDetectionPipeline(model=model, image_processor=processor) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + object_detector = ObjectDetectionPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"] def run_pipeline_test(self, object_detector, examples): diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index f7683aec15c3..72a484e8ceec 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -50,12 +50,12 @@ class QAPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): if isinstance(model.config, LxmertConfig): # This is an bimodal model, we need to find a more consistent way # to switch on those models. 
return None, None - question_answerer = QuestionAnsweringPipeline(model, tokenizer) + question_answerer = QuestionAnsweringPipeline(model, tokenizer, torch_dtype=torch_dtype) examples = [ {"question": "Where was HuggingFace founded ?", "context": "HuggingFace was founded in Paris."}, diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py index 8d745c376d84..70eae100e2a8 100644 --- a/tests/pipelines/test_pipelines_summarization.py +++ b/tests/pipelines/test_pipelines_summarization.py @@ -32,8 +32,8 @@ class SummarizationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"] def run_pipeline_test(self, summarizer, _): diff --git a/tests/pipelines/test_pipelines_table_question_answering.py b/tests/pipelines/test_pipelines_table_question_answering.py index a30763fc096d..54e4b9c4e6fd 100644 --- a/tests/pipelines/test_pipelines_table_question_answering.py +++ b/tests/pipelines/test_pipelines_table_question_answering.py @@ -152,9 +152,9 @@ def test_small_model_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch - def test_small_model_pt(self): + def test_small_model_pt(self, torch_dtype='float32'): model_id = "lysandre/tiny-tapas-random-wtq" - model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) + model = AutoModelForTableQuestionAnswering.from_pretrained(model_id, torch_dtype=torch_dtype) tokenizer = AutoTokenizer.from_pretrained(model_id) self.assertIsInstance(model.config.aggregation_labels, dict) self.assertIsInstance(model.config.no_aggregation_label_index, int) @@ -255,9 +255,14 @@ def test_small_model_pt(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch - def test_slow_tokenizer_sqa_pt(self): + def test_small_model_pt_fp16(self): + self.test_small_model_pt(torch_dtype="float16") + + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @require_torch + def test_slow_tokenizer_sqa_pt(self, torch_dtype='float32'): model_id = "lysandre/tiny-tapas-random-sqa" - model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) + model = AutoModelForTableQuestionAnswering.from_pretrained(model_id, torch_dtype=torch_dtype) tokenizer = AutoTokenizer.from_pretrained(model_id) table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) @@ -373,6 +378,11 @@ def test_slow_tokenizer_sqa_pt(self): }, ) + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @require_torch + def test_slow_tokenizer_sqa_pt_fp16(self): + self.test_slow_tokenizer_sqa_pt(torch_dtype="float16") + @require_tf @require_tensorflow_probability @require_pandas @@ -498,8 +508,8 @@ def test_slow_tokenizer_sqa_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch - def test_integration_wtq_pt(self): - 
table_querier = pipeline("table-question-answering") + def test_integration_wtq_pt(self, torch_dtype='float32'): + table_querier = pipeline("table-question-answering", torch_dtype=torch_dtype) data = { "Repository": ["Transformers", "Datasets", "Tokenizers"], @@ -541,6 +551,12 @@ def test_integration_wtq_pt(self): ] self.assertListEqual(results, expected_results) + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @slow + @require_torch + def test_integration_wtq_pt_fp16(self): + self.test_integration_wtq_pt(torch_dtype="float16") + @slow @require_tensorflow_probability @require_pandas @@ -593,11 +609,12 @@ def test_integration_wtq_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch - def test_integration_sqa_pt(self): + def test_integration_sqa_pt(self, torch_dtype='float32'): table_querier = pipeline( "table-question-answering", model="google/tapas-base-finetuned-sqa", tokenizer="google/tapas-base-finetuned-sqa", + torch_dtype=torch_dtype, ) data = { "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], @@ -615,6 +632,12 @@ def test_integration_sqa_pt(self): ] self.assertListEqual(results, expected_results) + @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") + @slow + @require_torch + def test_integration_sqa_pt_fp16(self): + self.test_integration_sqa_pt(torch_dtype="float16") + @slow @require_tensorflow_probability @require_pandas @@ -645,11 +668,12 @@ def test_integration_sqa_tf(self): @slow @require_torch - def test_large_model_pt_tapex(self): + def test_large_model_pt_tapex(self, torch_dtype='float32'): model_id = "microsoft/tapex-large-finetuned-wtq" table_querier = pipeline( "table-question-answering", model=model_id, + torch_dtype=torch_dtype, ) data = { "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], diff --git a/tests/pipelines/test_pipelines_text2text_generation.py b/tests/pipelines/test_pipelines_text2text_generation.py index eccae9850b3b..5349bdcba5d5 100644 --- a/tests/pipelines/test_pipelines_text2text_generation.py +++ b/tests/pipelines/test_pipelines_text2text_generation.py @@ -35,8 +35,8 @@ class Text2TextGenerationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return generator, ["Something to write", "Something else"] def run_pipeline_test(self, generator, _): diff --git a/tests/pipelines/test_pipelines_text_classification.py b/tests/pipelines/test_pipelines_text_classification.py index 63adfc45a029..1643886c86b6 100644 --- a/tests/pipelines/test_pipelines_text_classification.py +++ b/tests/pipelines/test_pipelines_text_classification.py @@ -179,8 +179,8 @@ def test_tf_bert(self): outputs = text_classifier("Birds are a type of animal") self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}]) - def get_test_pipeline(self, model, tokenizer, processor): - text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, 
torch_dtype='float32'): + text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return text_classifier, ["HuggingFace is in", "This is another test"] def run_pipeline_test(self, text_classifier, _): diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 4c91fd46cd97..491ed5b22161 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -320,8 +320,8 @@ def test_small_chat_model_tf(self): ], ) - def get_test_pipeline(self, model, tokenizer, processor): - text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return text_generator, ["This is a test", "Another test"] def test_stop_sequence_stopping_criteria(self): diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py index b780d26d79a4..608e412cf960 100644 --- a/tests/pipelines/test_pipelines_text_to_audio.py +++ b/tests/pipelines/test_pipelines_text_to_audio.py @@ -250,8 +250,8 @@ def test_generative_model_kwargs(self): outputs = music_generator("This is a test", forward_params=forward_params, generate_kwargs=generate_kwargs) self.assertListEqual(outputs["audio"].tolist(), audio.tolist()) - def get_test_pipeline(self, model, tokenizer, processor): - speech_generator = TextToAudioPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + speech_generator = TextToAudioPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return speech_generator, ["This is a test", "Another test"] def run_pipeline_test(self, speech_generator, _): diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py index eda9ac014bf7..bb2585ff0750 100644 --- a/tests/pipelines/test_pipelines_token_classification.py +++ b/tests/pipelines/test_pipelines_token_classification.py @@ -56,8 +56,8 @@ class TokenClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor): - token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return token_classifier, ["A simple string", "A simple string that is quite a bit longer"] def run_pipeline_test(self, token_classifier, _): diff --git a/tests/pipelines/test_pipelines_translation.py b/tests/pipelines/test_pipelines_translation.py index 61d390fe76eb..9a49c655309d 100644 --- a/tests/pipelines/test_pipelines_translation.py +++ b/tests/pipelines/test_pipelines_translation.py @@ -35,12 +35,12 @@ class TranslationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): if isinstance(model.config, MBartConfig): src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2] - 
translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang) + translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, torch_dtype=torch_dtype) else: - translator = TranslationPipeline(model=model, tokenizer=tokenizer) + translator = TranslationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return translator, ["Some string", "Some other text"] def run_pipeline_test(self, translator, _): diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index d23916bad84f..06f82bc8b373 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -38,11 +38,11 @@ class VideoClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): example_video_filepath = hf_hub_download( repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset" ) - video_classifier = VideoClassificationPipeline(model=model, image_processor=processor, top_k=2) + video_classifier = VideoClassificationPipeline(model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype) examples = [ example_video_filepath, "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4", diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py index 776046e160c4..d04566f860ac 100644 --- a/tests/pipelines/test_pipelines_visual_question_answering.py +++ b/tests/pipelines/test_pipelines_visual_question_answering.py @@ -55,8 +55,8 @@ def open(*args, **kwargs): class VisualQuestionAnsweringPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): - vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa") + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa", torch_dtype=torch_dtype) examples = [ { "image": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), diff --git a/tests/pipelines/test_pipelines_zero_shot.py b/tests/pipelines/test_pipelines_zero_shot.py index 2e61d97c1dc8..4559bdd26b91 100644 --- a/tests/pipelines/test_pipelines_zero_shot.py +++ b/tests/pipelines/test_pipelines_zero_shot.py @@ -42,9 +42,9 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): classifier = ZeroShotClassificationPipeline( - model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"] + model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"], torch_dtype=torch_dtype ) return classifier, ["Who are you voting for in 2020?", "My stomach hurts."] diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 09b2f56f9802..457f8379d574 100644 --- 
a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -28,9 +28,9 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase): # model_mapping = {CLAPConfig: CLAPModel} @require_torch - def test_small_model_pt(self): + def test_small_model_pt(self, torch_dtype='float32'): audio_classifier = pipeline( - task="zero-shot-audio-classification", model="hf-internal-testing/tiny-clap-htsat-unfused" + task="zero-shot-audio-classification", model="hf-internal-testing/tiny-clap-htsat-unfused", torch_dtype=torch_dtype ) dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") audio = dataset["train"]["audio"][-1]["array"] @@ -40,6 +40,10 @@ def test_small_model_pt(self): [{"score": 0.501, "label": "Sound of a dog"}, {"score": 0.499, "label": "Sound of vaccum cleaner"}], ) + @require_torch + def test_small_model_pt_fp16(self): + self.test_small_model_pt(torch_dtype='float16') + @unittest.skip("No models are available in TF") def test_small_model_tf(self): pass diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index 7adae8ee962a..16334608e8d2 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -71,9 +71,9 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase): # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) @require_torch - def test_small_model_pt(self): + def test_small_model_pt(self, torch_dtype='float32'): image_classifier = pipeline( - model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", + model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", torch_dtype=torch_dtype ) image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") output = image_classifier(image, candidate_labels=["a", "b", "c"]) @@ -127,6 +127,10 @@ def test_small_model_pt(self): ], ) + @require_torch + def test_small_model_pt_fp16(self): + self.test_small_model_pt(torch_dtype='float16') + @require_tf def test_small_model_tf(self): image_classifier = pipeline( diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index c8b424483fa2..c2d960404843 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -43,9 +43,9 @@ def open(*args, **kwargs): class ZeroShotObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): object_detector = pipeline( - "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection" + "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection", torch_dtype=torch_dtype ) examples = [ From c792b08c0fd25a322186c40c9feaeb3f655ba492 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 15:53:36 +0800 Subject: [PATCH 11/14] Fix formatting --- .../pipelines/depth_estimation.py | 2 +- .../pipelines/document_question_answering.py | 2 +- .../pipelines/image_classification.py | 2 +- .../pipelines/image_feature_extraction.py | 3 +- .../pipelines/image_segmentation.py | 4 +- 
src/transformers/pipelines/image_to_image.py | 2 +- src/transformers/pipelines/image_to_text.py | 8 +- src/transformers/pipelines/mask_generation.py | 2 +- .../pipelines/object_detection.py | 2 +- .../pipelines/video_classification.py | 2 +- .../pipelines/visual_question_answering.py | 2 +- .../zero_shot_image_classification.py | 2 +- .../pipelines/zero_shot_object_detection.py | 2 +- .../test_pipelines_audio_classification.py | 6 +- ..._pipelines_automatic_speech_recognition.py | 2 +- .../test_pipelines_depth_estimation.py | 2 +- ...t_pipelines_document_question_answering.py | 8 +- .../test_pipelines_feature_extraction.py | 6 +- tests/pipelines/test_pipelines_fill_mask.py | 2 +- .../test_pipelines_image_classification.py | 6 +- ...test_pipelines_image_feature_extraction.py | 6 +- .../test_pipelines_image_segmentation.py | 2 +- .../test_pipelines_image_to_image.py | 4 +- .../pipelines/test_pipelines_image_to_text.py | 6 +- .../test_pipelines_mask_generation.py | 2 +- .../test_pipelines_object_detection.py | 2 +- .../test_pipelines_question_answering.py | 2 +- .../pipelines/test_pipelines_summarization.py | 2 +- ...test_pipelines_table_question_answering.py | 10 +-- .../test_pipelines_text2text_generation.py | 2 +- .../test_pipelines_text_classification.py | 2 +- .../test_pipelines_text_generation.py | 2 +- .../pipelines/test_pipelines_text_to_audio.py | 2 +- .../test_pipelines_token_classification.py | 2 +- tests/pipelines/test_pipelines_translation.py | 6 +- .../test_pipelines_video_classification.py | 6 +- ...est_pipelines_visual_question_answering.py | 6 +- tests/pipelines/test_pipelines_zero_shot.py | 2 +- ...ipelines_zero_shot_audio_classification.py | 8 +- ...ipelines_zero_shot_image_classification.py | 4 +- ...st_pipelines_zero_shot_object_detection.py | 6 +- tests/test_pipeline_mixin.py | 78 +++++++++++-------- 42 files changed, 132 insertions(+), 97 deletions(-) diff --git a/src/transformers/pipelines/depth_estimation.py b/src/transformers/pipelines/depth_estimation.py index c27d6e2b7724..79a85008e7cf 100644 --- a/src/transformers/pipelines/depth_estimation.py +++ b/src/transformers/pipelines/depth_estimation.py @@ -91,7 +91,7 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout) self.image_size = image.size model_inputs = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) return model_inputs diff --git a/src/transformers/pipelines/document_question_answering.py b/src/transformers/pipelines/document_question_answering.py index f97f9e098d5d..c840c14a7191 100644 --- a/src/transformers/pipelines/document_question_answering.py +++ b/src/transformers/pipelines/document_question_answering.py @@ -295,7 +295,7 @@ def preprocess( image = load_image(input["image"], timeout=timeout) if self.image_processor is not None: image_inputs = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": image_inputs = image_inputs.to(self.torch_dtype) image_features.update(image_inputs) elif self.feature_extractor is not None: diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py index 49e6574bc96a..61ee122a5e54 100644 --- a/src/transformers/pipelines/image_classification.py +++ b/src/transformers/pipelines/image_classification.py @@ -159,7 +159,7 @@ def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Imag 
def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) model_inputs = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) return model_inputs diff --git a/src/transformers/pipelines/image_feature_extraction.py b/src/transformers/pipelines/image_feature_extraction.py index 8e0b8b0151c1..391eb2b3aec7 100644 --- a/src/transformers/pipelines/image_feature_extraction.py +++ b/src/transformers/pipelines/image_feature_extraction.py @@ -7,6 +7,7 @@ if is_vision_available(): from ..image_utils import load_image + @add_end_docstrings( build_pipeline_init_args(has_image_processor=True), """ @@ -59,7 +60,7 @@ def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]: image = load_image(image, timeout=timeout) model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) return model_inputs diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 12e9353c9ba1..e0fd3b7d85ab 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -147,7 +147,7 @@ def preprocess(self, image, subtask=None, timeout=None): else: kwargs = {"task_inputs": [subtask]} inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs) - if self.framework == 'pt': + if self.framework == "pt": inputs = inputs.to(self.torch_dtype) inputs["task_inputs"] = self.tokenizer( inputs["task_inputs"], @@ -157,7 +157,7 @@ def preprocess(self, image, subtask=None, timeout=None): )["input_ids"] else: inputs = self.image_processor(images=[image], return_tensors="pt") - if self.framework == 'pt': + if self.framework == "pt": inputs = inputs.to(self.torch_dtype) inputs["target_size"] = target_size return inputs diff --git a/src/transformers/pipelines/image_to_image.py b/src/transformers/pipelines/image_to_image.py index 5c2acfe9eb30..cb66359a4ddd 100644 --- a/src/transformers/pipelines/image_to_image.py +++ b/src/transformers/pipelines/image_to_image.py @@ -119,7 +119,7 @@ def _forward(self, model_inputs): def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors="pt") - if self.framework == 'pt': + if self.framework == "pt": inputs = inputs.to(self.torch_dtype) return inputs diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 8cc76c37e39e..88dce8e591ae 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -138,7 +138,7 @@ def preprocess(self, image, prompt=None, timeout=None): if model_type == "git": model_inputs = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids input_ids = [self.tokenizer.cls_token_id] + input_ids @@ -147,13 +147,13 @@ def preprocess(self, image, prompt=None, timeout=None): elif model_type == "pix2struct": model_inputs = self.image_processor(images=image, header_text=prompt, 
return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) elif model_type != "vision-encoder-decoder": # vision-encoder-decoder does not support conditional generation model_inputs = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) text_inputs = self.tokenizer(prompt, return_tensors=self.framework) model_inputs.update(text_inputs) @@ -163,7 +163,7 @@ def preprocess(self, image, prompt=None, timeout=None): else: model_inputs = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) if self.model.config.model_type == "git" and prompt is None: diff --git a/src/transformers/pipelines/mask_generation.py b/src/transformers/pipelines/mask_generation.py index 32d66a6924a4..f87e45b7f8ec 100644 --- a/src/transformers/pipelines/mask_generation.py +++ b/src/transformers/pipelines/mask_generation.py @@ -181,7 +181,7 @@ def preprocess( image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor ) model_inputs = self.image_processor(images=cropped_images, return_tensors="pt") - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) with self.device_placement(): diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py index 7536172ce8b8..d3e2135790ff 100644 --- a/src/transformers/pipelines/object_detection.py +++ b/src/transformers/pipelines/object_detection.py @@ -107,7 +107,7 @@ def preprocess(self, image, timeout=None): image = load_image(image, timeout=timeout) target_size = torch.IntTensor([[image.height, image.width]]) inputs = self.image_processor(images=[image], return_tensors="pt") - if self.framework == 'pt': + if self.framework == "pt": inputs = inputs.to(self.torch_dtype) if self.tokenizer is not None: inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt") diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py index c08a3bd3ab8e..68ea928bce56 100644 --- a/src/transformers/pipelines/video_classification.py +++ b/src/transformers/pipelines/video_classification.py @@ -106,7 +106,7 @@ def preprocess(self, video, num_frames=None, frame_sampling_rate=1): video = list(video) model_inputs = self.image_processor(video, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": model_inputs = model_inputs.to(self.torch_dtype) return model_inputs diff --git a/src/transformers/pipelines/visual_question_answering.py b/src/transformers/pipelines/visual_question_answering.py index 1ea279e2e81a..e5849cbdec19 100644 --- a/src/transformers/pipelines/visual_question_answering.py +++ b/src/transformers/pipelines/visual_question_answering.py @@ -155,7 +155,7 @@ def preprocess(self, inputs, padding=False, truncation=False, timeout=None): truncation=truncation, ) image_features = self.image_processor(images=image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": image_features = image_features.to(self.torch_dtype) model_inputs.update(image_features) return model_inputs diff --git a/src/transformers/pipelines/zero_shot_image_classification.py 
b/src/transformers/pipelines/zero_shot_image_classification.py index de453aa52f2a..b0ceba8cbe67 100644 --- a/src/transformers/pipelines/zero_shot_image_classification.py +++ b/src/transformers/pipelines/zero_shot_image_classification.py @@ -120,7 +120,7 @@ def _sanitize_parameters(self, **kwargs): def preprocess(self, image, candidate_labels=None, hypothesis_template="This is a photo of {}.", timeout=None): image = load_image(image, timeout=timeout) inputs = self.image_processor(images=[image], return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": inputs = inputs.to(self.torch_dtype) inputs["candidate_labels"] = candidate_labels sequences = [hypothesis_template.format(x) for x in candidate_labels] diff --git a/src/transformers/pipelines/zero_shot_object_detection.py b/src/transformers/pipelines/zero_shot_object_detection.py index 52a108c627eb..9ad575202266 100644 --- a/src/transformers/pipelines/zero_shot_object_detection.py +++ b/src/transformers/pipelines/zero_shot_object_detection.py @@ -156,7 +156,7 @@ def preprocess(self, inputs, timeout=None): for i, candidate_label in enumerate(candidate_labels): text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework) image_features = self.image_processor(image, return_tensors=self.framework) - if self.framework == 'pt': + if self.framework == "pt": image_features = image_features.to(self.torch_dtype) yield { "is_last": i == len(candidate_labels) - 1, diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index efa3edf0d0f9..f699a50c67e1 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -35,8 +35,10 @@ class AudioClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): - audio_classifier = AudioClassificationPipeline(model=model, feature_extractor=processor, torch_dtype=torch_dtype) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + audio_classifier = AudioClassificationPipeline( + model=model, feature_extractor=processor, torch_dtype=torch_dtype + ) # test with a raw waveform audio = np.zeros((34000,)) diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 26da2004f3e4..aa02f1c9c688 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -66,7 +66,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): + (MODEL_FOR_CTC_MAPPING.items() if MODEL_FOR_CTC_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if tokenizer is None: # Side effect of no Fast Tokenizer class for these model, so skipping # But the slow tokenizer test should still run as they're quite small diff --git a/tests/pipelines/test_pipelines_depth_estimation.py b/tests/pipelines/test_pipelines_depth_estimation.py index ffffd31be6c2..d8d7b5749cf0 100644 --- a/tests/pipelines/test_pipelines_depth_estimation.py +++ b/tests/pipelines/test_pipelines_depth_estimation.py @@ -56,7 +56,7 @@ def hashimage(image: Image) -> str: class 
DepthEstimationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): depth_estimator = DepthEstimationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return depth_estimator, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_document_question_answering.py b/tests/pipelines/test_pipelines_document_question_answering.py index 0d40ee5663e8..f3e9ca9217e2 100644 --- a/tests/pipelines/test_pipelines_document_question_answering.py +++ b/tests/pipelines/test_pipelines_document_question_answering.py @@ -61,9 +61,13 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase): @require_pytesseract @require_vision - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): dqa_pipeline = pipeline( - "document-question-answering", model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype + "document-question-answering", + model=model, + tokenizer=tokenizer, + image_processor=processor, + torch_dtype=torch_dtype, ) image = INVOICE_URL diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index 042c0c62b942..f3d5296a7776 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -174,7 +174,7 @@ def get_shape(self, input_, shape=None): raise ValueError("We expect lists of floats, nothing else") return shape - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if tokenizer is None: self.skipTest("No tokenizer") return @@ -195,7 +195,9 @@ def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): ) return - feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype) + feature_extractor = FeatureExtractionPipeline( + model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype + ) return feature_extractor, ["This is a test", "This is another test"] def run_pipeline_test(self, feature_extractor, examples): diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py index fd38fadc6cbf..2e622334698b 100644 --- a/tests/pipelines/test_pipelines_fill_mask.py +++ b/tests/pipelines/test_pipelines_fill_mask.py @@ -251,7 +251,7 @@ def test_model_no_pad_tf(self): unmasker.tokenizer.pad_token = None self.run_pipeline_test(unmasker, []) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if tokenizer is None or tokenizer.mask_token_id is None: self.skipTest("The provided tokenizer has no mask token, (probably reformer or wav2vec2)") diff --git a/tests/pipelines/test_pipelines_image_classification.py b/tests/pipelines/test_pipelines_image_classification.py index 5dc1a4c6dcf5..50a71be38cad 100644 --- a/tests/pipelines/test_pipelines_image_classification.py +++ b/tests/pipelines/test_pipelines_image_classification.py @@ -51,8 +51,10 @@ class ImageClassificationPipelineTests(unittest.TestCase): 
model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING tf_model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): - image_classifier = ImageClassificationPipeline(model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + image_classifier = ImageClassificationPipeline( + model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype + ) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "http://images.cocodataset.org/val2017/000000039769.jpg", diff --git a/tests/pipelines/test_pipelines_image_feature_extraction.py b/tests/pipelines/test_pipelines_image_feature_extraction.py index 8f288ddf2be5..f4ee9afcd145 100644 --- a/tests/pipelines/test_pipelines_image_feature_extraction.py +++ b/tests/pipelines/test_pipelines_image_feature_extraction.py @@ -157,7 +157,7 @@ def test_return_tensors_tf(self): outputs = feature_extractor(img, return_tensors=True) self.assertTrue(tf.is_tensor(outputs)) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if processor is None: self.skipTest("No image processor") @@ -173,7 +173,9 @@ def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): """ ) - feature_extractor = ImageFeatureExtractionPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) + feature_extractor = ImageFeatureExtractionPipeline( + model=model, image_processor=processor, torch_dtype=torch_dtype + ) img = prepare_img() return feature_extractor, [img, img] diff --git a/tests/pipelines/test_pipelines_image_segmentation.py b/tests/pipelines/test_pipelines_image_segmentation.py index dda31880a996..af1fdda8de63 100644 --- a/tests/pipelines/test_pipelines_image_segmentation.py +++ b/tests/pipelines/test_pipelines_image_segmentation.py @@ -87,7 +87,7 @@ class ImageSegmentationPipelineTests(unittest.TestCase): + (MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() if MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): image_segmenter = ImageSegmentationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_image_to_image.py b/tests/pipelines/test_pipelines_image_to_image.py index 4612098a389c..29d590a8e34c 100644 --- a/tests/pipelines/test_pipelines_image_to_image.py +++ b/tests/pipelines/test_pipelines_image_to_image.py @@ -54,7 +54,7 @@ class ImageToImagePipelineTests(unittest.TestCase): @require_torch @require_vision @slow - def test_pipeline(self, torch_dtype='float32'): + def test_pipeline(self, torch_dtype="float32"): model_id = "caidas/swin2SR-classical-sr-x2-64" upscaler = pipeline("image-to-image", model=model_id, torch_dtype=torch_dtype) upscaled_list = upscaler(self.examples) @@ -70,7 +70,7 @@ def test_pipeline(self, torch_dtype='float32'): @require_vision @slow def test_pipeline_fp16(self): - self.test_pipeline(torch_dtype='float16') + self.test_pipeline(torch_dtype="float16") @require_torch @require_vision diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py 
index cef8626bbabd..6d6c11a59c10 100644 --- a/tests/pipelines/test_pipelines_image_to_text.py +++ b/tests/pipelines/test_pipelines_image_to_text.py @@ -45,8 +45,10 @@ class ImageToTextPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING tf_model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): - pipe = pipeline("image-to-text", model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + pipe = pipeline( + "image-to-text", model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype + ) examples = [ Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_mask_generation.py b/tests/pipelines/test_pipelines_mask_generation.py index 520153efbe04..328582c2ba8f 100644 --- a/tests/pipelines/test_pipelines_mask_generation.py +++ b/tests/pipelines/test_pipelines_mask_generation.py @@ -67,7 +67,7 @@ class MaskGenerationPipelineTests(unittest.TestCase): (list(TF_MODEL_FOR_MASK_GENERATION_MAPPING.items()) if TF_MODEL_FOR_MASK_GENERATION_MAPPING else []) ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): image_segmenter = MaskGenerationPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return image_segmenter, [ "./tests/fixtures/tests_samples/COCO/000000039769.png", diff --git a/tests/pipelines/test_pipelines_object_detection.py b/tests/pipelines/test_pipelines_object_detection.py index a219e5954eba..d0057e29d0b4 100644 --- a/tests/pipelines/test_pipelines_object_detection.py +++ b/tests/pipelines/test_pipelines_object_detection.py @@ -53,7 +53,7 @@ def open(*args, **kwargs): class ObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): object_detector = ObjectDetectionPipeline(model=model, image_processor=processor, torch_dtype=torch_dtype) return object_detector, ["./tests/fixtures/tests_samples/COCO/000000039769.png"] diff --git a/tests/pipelines/test_pipelines_question_answering.py b/tests/pipelines/test_pipelines_question_answering.py index 72a484e8ceec..8b68989600ee 100644 --- a/tests/pipelines/test_pipelines_question_answering.py +++ b/tests/pipelines/test_pipelines_question_answering.py @@ -50,7 +50,7 @@ class QAPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if isinstance(model.config, LxmertConfig): # This is an bimodal model, we need to find a more consistent way # to switch on those models. 
diff --git a/tests/pipelines/test_pipelines_summarization.py b/tests/pipelines/test_pipelines_summarization.py index 70eae100e2a8..fb1dce0ca384 100644 --- a/tests/pipelines/test_pipelines_summarization.py +++ b/tests/pipelines/test_pipelines_summarization.py @@ -32,7 +32,7 @@ class SummarizationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return summarizer, ["(CNN)The Palestinian Authority officially became", "Some other text"] diff --git a/tests/pipelines/test_pipelines_table_question_answering.py b/tests/pipelines/test_pipelines_table_question_answering.py index 54e4b9c4e6fd..9481ab200063 100644 --- a/tests/pipelines/test_pipelines_table_question_answering.py +++ b/tests/pipelines/test_pipelines_table_question_answering.py @@ -152,7 +152,7 @@ def test_small_model_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch - def test_small_model_pt(self, torch_dtype='float32'): + def test_small_model_pt(self, torch_dtype="float32"): model_id = "lysandre/tiny-tapas-random-wtq" model = AutoModelForTableQuestionAnswering.from_pretrained(model_id, torch_dtype=torch_dtype) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -260,7 +260,7 @@ def test_small_model_pt_fp16(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch - def test_slow_tokenizer_sqa_pt(self, torch_dtype='float32'): + def test_slow_tokenizer_sqa_pt(self, torch_dtype="float32"): model_id = "lysandre/tiny-tapas-random-sqa" model = AutoModelForTableQuestionAnswering.from_pretrained(model_id, torch_dtype=torch_dtype) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -508,7 +508,7 @@ def test_slow_tokenizer_sqa_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch - def test_integration_wtq_pt(self, torch_dtype='float32'): + def test_integration_wtq_pt(self, torch_dtype="float32"): table_querier = pipeline("table-question-answering", torch_dtype=torch_dtype) data = { @@ -609,7 +609,7 @@ def test_integration_wtq_tf(self): @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch - def test_integration_sqa_pt(self, torch_dtype='float32'): + def test_integration_sqa_pt(self, torch_dtype="float32"): table_querier = pipeline( "table-question-answering", model="google/tapas-base-finetuned-sqa", @@ -668,7 +668,7 @@ def test_integration_sqa_tf(self): @slow @require_torch - def test_large_model_pt_tapex(self, torch_dtype='float32'): + def test_large_model_pt_tapex(self, torch_dtype="float32"): model_id = "microsoft/tapex-large-finetuned-wtq" table_querier = pipeline( "table-question-answering", diff --git a/tests/pipelines/test_pipelines_text2text_generation.py b/tests/pipelines/test_pipelines_text2text_generation.py index 5349bdcba5d5..52fb59edd364 100644 --- a/tests/pipelines/test_pipelines_text2text_generation.py +++ b/tests/pipelines/test_pipelines_text2text_generation.py @@ -35,7 +35,7 @@ class Text2TextGenerationPipelineTests(unittest.TestCase): model_mapping = 
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return generator, ["Something to write", "Something else"] diff --git a/tests/pipelines/test_pipelines_text_classification.py b/tests/pipelines/test_pipelines_text_classification.py index 1643886c86b6..4956cb8aed13 100644 --- a/tests/pipelines/test_pipelines_text_classification.py +++ b/tests/pipelines/test_pipelines_text_classification.py @@ -179,7 +179,7 @@ def test_tf_bert(self): outputs = text_classifier("Birds are a type of animal") self.assertEqual(nested_simplify(outputs), [{"label": "POSITIVE", "score": 0.988}]) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): text_classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return text_classifier, ["HuggingFace is in", "This is another test"] diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 491ed5b22161..844e79f4f839 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -320,7 +320,7 @@ def test_small_chat_model_tf(self): ], ) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return text_generator, ["This is a test", "Another test"] diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py index 608e412cf960..655fe5961b52 100644 --- a/tests/pipelines/test_pipelines_text_to_audio.py +++ b/tests/pipelines/test_pipelines_text_to_audio.py @@ -250,7 +250,7 @@ def test_generative_model_kwargs(self): outputs = music_generator("This is a test", forward_params=forward_params, generate_kwargs=generate_kwargs) self.assertListEqual(outputs["audio"].tolist(), audio.tolist()) - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): speech_generator = TextToAudioPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return speech_generator, ["This is a test", "Another test"] diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py index bb2585ff0750..41415c8c3458 100644 --- a/tests/pipelines/test_pipelines_token_classification.py +++ b/tests/pipelines/test_pipelines_token_classification.py @@ -56,7 +56,7 @@ class TokenClassificationPipelineTests(unittest.TestCase): config: model for config, model in tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): token_classifier = TokenClassificationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return token_classifier, ["A simple string", "A simple string that is quite a bit longer"] diff --git 
a/tests/pipelines/test_pipelines_translation.py b/tests/pipelines/test_pipelines_translation.py index 9a49c655309d..c31ba49e7660 100644 --- a/tests/pipelines/test_pipelines_translation.py +++ b/tests/pipelines/test_pipelines_translation.py @@ -35,10 +35,12 @@ class TranslationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING tf_model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): if isinstance(model.config, MBartConfig): src_lang, tgt_lang = list(tokenizer.lang_code_to_id.keys())[:2] - translator = TranslationPipeline(model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, torch_dtype=torch_dtype) + translator = TranslationPipeline( + model=model, tokenizer=tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, torch_dtype=torch_dtype + ) else: translator = TranslationPipeline(model=model, tokenizer=tokenizer, torch_dtype=torch_dtype) return translator, ["Some string", "Some other text"] diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py index 06f82bc8b373..f4ea49da25c4 100644 --- a/tests/pipelines/test_pipelines_video_classification.py +++ b/tests/pipelines/test_pipelines_video_classification.py @@ -38,11 +38,13 @@ class VideoClassificationPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): example_video_filepath = hf_hub_download( repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset" ) - video_classifier = VideoClassificationPipeline(model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype) + video_classifier = VideoClassificationPipeline( + model=model, image_processor=processor, top_k=2, torch_dtype=torch_dtype + ) examples = [ example_video_filepath, "https://huggingface.co/datasets/nateraw/video-demo/resolve/main/archery.mp4", diff --git a/tests/pipelines/test_pipelines_visual_question_answering.py b/tests/pipelines/test_pipelines_visual_question_answering.py index d04566f860ac..b9ee24f3a4c4 100644 --- a/tests/pipelines/test_pipelines_visual_question_answering.py +++ b/tests/pipelines/test_pipelines_visual_question_answering.py @@ -55,8 +55,10 @@ def open(*args, **kwargs): class VisualQuestionAnsweringPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): - vqa_pipeline = pipeline("visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa", torch_dtype=torch_dtype) + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + vqa_pipeline = pipeline( + "visual-question-answering", model="hf-internal-testing/tiny-vilt-random-vqa", torch_dtype=torch_dtype + ) examples = [ { "image": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), diff --git a/tests/pipelines/test_pipelines_zero_shot.py b/tests/pipelines/test_pipelines_zero_shot.py index 4559bdd26b91..1003898df6c9 100644 --- a/tests/pipelines/test_pipelines_zero_shot.py +++ b/tests/pipelines/test_pipelines_zero_shot.py @@ -42,7 +42,7 @@ class ZeroShotClassificationPipelineTests(unittest.TestCase): config: model for config, model in 
tf_model_mapping.items() if config.__name__ not in _TO_SKIP } - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): classifier = ZeroShotClassificationPipeline( model=model, tokenizer=tokenizer, candidate_labels=["polics", "health"], torch_dtype=torch_dtype ) diff --git a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py index 457f8379d574..a2cac59a1b74 100644 --- a/tests/pipelines/test_pipelines_zero_shot_audio_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_audio_classification.py @@ -28,9 +28,11 @@ class ZeroShotAudioClassificationPipelineTests(unittest.TestCase): # model_mapping = {CLAPConfig: CLAPModel} @require_torch - def test_small_model_pt(self, torch_dtype='float32'): + def test_small_model_pt(self, torch_dtype="float32"): audio_classifier = pipeline( - task="zero-shot-audio-classification", model="hf-internal-testing/tiny-clap-htsat-unfused", torch_dtype=torch_dtype + task="zero-shot-audio-classification", + model="hf-internal-testing/tiny-clap-htsat-unfused", + torch_dtype=torch_dtype, ) dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example") audio = dataset["train"]["audio"][-1]["array"] @@ -42,7 +44,7 @@ def test_small_model_pt(self, torch_dtype='float32'): @require_torch def test_small_model_pt_fp16(self): - self.test_small_model_pt(torch_dtype='float16') + self.test_small_model_pt(torch_dtype="float16") @unittest.skip("No models are available in TF") def test_small_model_tf(self): diff --git a/tests/pipelines/test_pipelines_zero_shot_image_classification.py b/tests/pipelines/test_pipelines_zero_shot_image_classification.py index 16334608e8d2..227411508496 100644 --- a/tests/pipelines/test_pipelines_zero_shot_image_classification.py +++ b/tests/pipelines/test_pipelines_zero_shot_image_classification.py @@ -71,7 +71,7 @@ class ZeroShotImageClassificationPipelineTests(unittest.TestCase): # outputs = pipe([image] * 3, batch_size=2, candidate_labels=["A", "B"]) @require_torch - def test_small_model_pt(self, torch_dtype='float32'): + def test_small_model_pt(self, torch_dtype="float32"): image_classifier = pipeline( model="hf-internal-testing/tiny-random-clip-zero-shot-image-classification", torch_dtype=torch_dtype ) @@ -129,7 +129,7 @@ def test_small_model_pt(self, torch_dtype='float32'): @require_torch def test_small_model_pt_fp16(self): - self.test_small_model_pt(torch_dtype='float16') + self.test_small_model_pt(torch_dtype="float16") @require_tf def test_small_model_tf(self): diff --git a/tests/pipelines/test_pipelines_zero_shot_object_detection.py b/tests/pipelines/test_pipelines_zero_shot_object_detection.py index c2d960404843..01cd0e3454e0 100644 --- a/tests/pipelines/test_pipelines_zero_shot_object_detection.py +++ b/tests/pipelines/test_pipelines_zero_shot_object_detection.py @@ -43,9 +43,11 @@ def open(*args, **kwargs): class ZeroShotObjectDetectionPipelineTests(unittest.TestCase): model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING - def get_test_pipeline(self, model, tokenizer, processor, torch_dtype='float32'): + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): object_detector = pipeline( - "zero-shot-object-detection", model="hf-internal-testing/tiny-random-owlvit-object-detection", torch_dtype=torch_dtype + "zero-shot-object-detection", + model="hf-internal-testing/tiny-random-owlvit-object-detection", 
+ torch_dtype=torch_dtype, ) examples = [ diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 8f2aa1e30b22..acd8de480874 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -126,7 +126,7 @@ class PipelineTesterMixin: pipeline_model_mapping = None supported_frameworks = ["pt", "tf"] - def run_task_tests(self, task, torch_dtype='float32'): + def run_task_tests(self, task, torch_dtype="float32"): """Run pipeline tests for a specific `task` Args: @@ -176,7 +176,9 @@ def run_task_tests(self, task, torch_dtype='float32'): task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype ) - def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype='float32'): + def run_model_pipeline_tests( + self, task, repo_name, model_architecture, tokenizer_names, processor_names, commit, torch_dtype="float32" + ): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class names Args: @@ -207,7 +209,7 @@ def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenize model_architecture, tokenizer_name, processor_name, - torch_dtype + torch_dtype, ): logger.warning( f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " @@ -215,9 +217,13 @@ def run_model_pipeline_tests(self, task, repo_name, model_architecture, tokenize f"`{tokenizer_name}` | processor `{processor_name}`." ) continue - self.run_pipeline_test(task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype) + self.run_pipeline_test( + task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype + ) - def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype='float32'): + def run_pipeline_test( + self, task, repo_name, model_architecture, tokenizer_name, processor_name, commit, torch_dtype="float32" + ): """Run pipeline tests for a specific `task` with the give model class and tokenizer/processor class name The model will be loaded from a model repository on the Hub. 
@@ -280,7 +286,9 @@ def run_pipeline_test(self, task, repo_name, model_architecture, tokenizer_name, return pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ - if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor, torch_dtype): + if self.is_pipeline_test_to_skip_more( + pipeline_test_class_name, model.config, model, tokenizer, processor, torch_dtype + ): logger.warning( f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " @@ -336,7 +344,7 @@ def test_pipeline_audio_classification(self): @is_pipeline_test @require_torch def test_pipeline_audio_classification_fp16(self): - self.run_task_tests(task="audio-classification", torch_dtype='float16') + self.run_task_tests(task="audio-classification", torch_dtype="float16") @is_pipeline_test def test_pipeline_automatic_speech_recognition(self): @@ -345,7 +353,7 @@ def test_pipeline_automatic_speech_recognition(self): @is_pipeline_test @require_torch def test_pipeline_automatic_speech_recognition_fp16(self): - self.run_task_tests(task="automatic-speech-recognition", torch_dtype='float16') + self.run_task_tests(task="automatic-speech-recognition", torch_dtype="float16") @is_pipeline_test @require_vision @@ -359,7 +367,7 @@ def test_pipeline_depth_estimation(self): @require_timm @require_torch def test_pipeline_depth_estimation_fp16(self): - self.run_task_tests(task="depth-estimation", torch_dtype='float16') + self.run_task_tests(task="depth-estimation", torch_dtype="float16") @is_pipeline_test @require_pytesseract @@ -373,7 +381,7 @@ def test_pipeline_document_question_answering(self): @require_torch @require_vision def test_pipeline_document_question_answering_fp16(self): - self.run_task_tests(task="document-question-answering", torch_dtype='float16') + self.run_task_tests(task="document-question-answering", torch_dtype="float16") @is_pipeline_test def test_pipeline_feature_extraction(self): @@ -382,7 +390,7 @@ def test_pipeline_feature_extraction(self): @is_pipeline_test @require_torch def test_pipeline_feature_extraction_fp16(self): - self.run_task_tests(task="feature-extraction", torch_dtype='float16') + self.run_task_tests(task="feature-extraction", torch_dtype="float16") @is_pipeline_test def test_pipeline_fill_mask(self): @@ -391,7 +399,7 @@ def test_pipeline_fill_mask(self): @is_pipeline_test @require_torch def test_pipeline_fill_mask_fp16(self): - self.run_task_tests(task="fill-mask", torch_dtype='float16') + self.run_task_tests(task="fill-mask", torch_dtype="float16") @is_pipeline_test @require_torch_or_tf @@ -403,7 +411,7 @@ def test_pipeline_image_classification(self): @require_vision @require_torch def test_pipeline_image_classification_fp16(self): - self.run_task_tests(task="image-classification", torch_dtype='float16') + self.run_task_tests(task="image-classification", torch_dtype="float16") @is_pipeline_test @require_vision @@ -417,7 +425,7 @@ def test_pipeline_image_segmentation(self): @require_timm @require_torch def test_pipeline_image_segmentation_fp16(self): - self.run_task_tests(task="image-segmentation", torch_dtype='float16') + self.run_task_tests(task="image-segmentation", torch_dtype="float16") @is_pipeline_test @require_vision @@ -428,7 +436,7 @@ def test_pipeline_image_to_text(self): @require_vision @require_torch def test_pipeline_image_to_text_fp16(self): - self.run_task_tests(task="image-to-text", 
torch_dtype='float16') + self.run_task_tests(task="image-to-text", torch_dtype="float16") @is_pipeline_test @require_timm @@ -442,7 +450,7 @@ def test_pipeline_image_feature_extraction(self): @require_vision @require_torch def test_pipeline_image_feature_extraction_fp16(self): - self.run_task_tests(task="image-feature-extraction", torch_dtype='float16') + self.run_task_tests(task="image-feature-extraction", torch_dtype="float16") @unittest.skip(reason="`run_pipeline_test` is currently not implemented.") @is_pipeline_test @@ -456,7 +464,7 @@ def test_pipeline_mask_generation(self): @require_vision @require_torch def test_pipeline_mask_generation_fp16(self): - self.run_task_tests(task="mask-generation", torch_dtype='float16') + self.run_task_tests(task="mask-generation", torch_dtype="float16") @is_pipeline_test @require_vision @@ -470,7 +478,7 @@ def test_pipeline_object_detection(self): @require_timm @require_torch def test_pipeline_object_detection_fp16(self): - self.run_task_tests(task="object-detection", torch_dtype='float16') + self.run_task_tests(task="object-detection", torch_dtype="float16") @is_pipeline_test def test_pipeline_question_answering(self): @@ -479,7 +487,7 @@ def test_pipeline_question_answering(self): @is_pipeline_test @require_torch def test_pipeline_question_answering_fp16(self): - self.run_task_tests(task="question-answering", torch_dtype='float16') + self.run_task_tests(task="question-answering", torch_dtype="float16") @is_pipeline_test def test_pipeline_summarization(self): @@ -488,7 +496,7 @@ def test_pipeline_summarization(self): @is_pipeline_test @require_torch def test_pipeline_summarization_fp16(self): - self.run_task_tests(task="summarization", torch_dtype='float16') + self.run_task_tests(task="summarization", torch_dtype="float16") @is_pipeline_test def test_pipeline_table_question_answering(self): @@ -497,7 +505,7 @@ def test_pipeline_table_question_answering(self): @is_pipeline_test @require_torch def test_pipeline_table_question_answering_fp16(self): - self.run_task_tests(task="table-question-answering", torch_dtype='float16') + self.run_task_tests(task="table-question-answering", torch_dtype="float16") @is_pipeline_test def test_pipeline_text2text_generation(self): @@ -506,7 +514,7 @@ def test_pipeline_text2text_generation(self): @is_pipeline_test @require_torch def test_pipeline_text2text_generation_fp16(self): - self.run_task_tests(task="text2text-generation", torch_dtype='float16') + self.run_task_tests(task="text2text-generation", torch_dtype="float16") @is_pipeline_test def test_pipeline_text_classification(self): @@ -515,7 +523,7 @@ def test_pipeline_text_classification(self): @is_pipeline_test @require_torch def test_pipeline_text_classification_fp16(self): - self.run_task_tests(task="text-classification", torch_dtype='float16') + self.run_task_tests(task="text-classification", torch_dtype="float16") @is_pipeline_test @require_torch_or_tf @@ -525,7 +533,7 @@ def test_pipeline_text_generation(self): @is_pipeline_test @require_torch def test_pipeline_text_generation_fp16(self): - self.run_task_tests(task="text-generation", torch_dtype='float16') + self.run_task_tests(task="text-generation", torch_dtype="float16") @is_pipeline_test @require_torch @@ -535,7 +543,7 @@ def test_pipeline_text_to_audio(self): @is_pipeline_test @require_torch def test_pipeline_text_to_audio_fp16(self): - self.run_task_tests(task="text-to-audio", torch_dtype='float16') + self.run_task_tests(task="text-to-audio", torch_dtype="float16") @is_pipeline_test def 
test_pipeline_token_classification(self): @@ -544,7 +552,7 @@ def test_pipeline_token_classification(self): @is_pipeline_test @require_torch def test_pipeline_token_classification_fp16(self): - self.run_task_tests(task="token-classification", torch_dtype='float16') + self.run_task_tests(task="token-classification", torch_dtype="float16") @is_pipeline_test def test_pipeline_translation(self): @@ -553,7 +561,7 @@ def test_pipeline_translation(self): @is_pipeline_test @require_torch def test_pipeline_translation_fp16(self): - self.run_task_tests(task="translation", torch_dtype='float16') + self.run_task_tests(task="translation", torch_dtype="float16") @is_pipeline_test @require_torch_or_tf @@ -567,7 +575,7 @@ def test_pipeline_video_classification(self): @require_decord @require_torch def test_pipeline_video_classification_fp16(self): - self.run_task_tests(task="video-classification", torch_dtype='float16') + self.run_task_tests(task="video-classification", torch_dtype="float16") @is_pipeline_test @require_torch @@ -579,7 +587,7 @@ def test_pipeline_visual_question_answering(self): @require_torch @require_vision def test_pipeline_visual_question_answering_fp16(self): - self.run_task_tests(task="visual-question-answering", torch_dtype='float16') + self.run_task_tests(task="visual-question-answering", torch_dtype="float16") @is_pipeline_test def test_pipeline_zero_shot(self): @@ -588,7 +596,7 @@ def test_pipeline_zero_shot(self): @is_pipeline_test @require_torch def test_pipeline_zero_shot_fp16(self): - self.run_task_tests(task="zero-shot", torch_dtype='float16') + self.run_task_tests(task="zero-shot", torch_dtype="float16") @is_pipeline_test @require_torch @@ -598,7 +606,7 @@ def test_pipeline_zero_shot_audio_classification(self): @is_pipeline_test @require_torch def test_pipeline_zero_shot_audio_classification_fp16(self): - self.run_task_tests(task="zero-shot-audio-classification", torch_dtype='float16') + self.run_task_tests(task="zero-shot-audio-classification", torch_dtype="float16") @is_pipeline_test @require_vision @@ -609,7 +617,7 @@ def test_pipeline_zero_shot_image_classification(self): @require_vision @require_torch def test_pipeline_zero_shot_image_classification_fp16(self): - self.run_task_tests(task="zero-shot-image-classification", torch_dtype='float16') + self.run_task_tests(task="zero-shot-image-classification", torch_dtype="float16") @is_pipeline_test @require_vision @@ -621,7 +629,7 @@ def test_pipeline_zero_shot_object_detection(self): @require_vision @require_torch def test_pipeline_zero_shot_object_detection_fp16(self): - self.run_task_tests(task="zero-shot-object-detection", torch_dtype='float16') + self.run_task_tests(task="zero-shot-object-detection", torch_dtype="float16") # This contains the test cases to be skipped without model architecture being involved. def is_pipeline_test_to_skip( @@ -642,7 +650,9 @@ def is_pipeline_test_to_skip( return False - def is_pipeline_test_to_skip_more(self, pipeline_test_casse_name, config, model, tokenizer, processor, torch_dtype): # noqa + def is_pipeline_test_to_skip_more( + self, pipeline_test_casse_name, config, model, tokenizer, processor, torch_dtype + ): # noqa """Skip some more tests based on the information from the instantiated objects.""" # No fix is required for this case. 
if ( From 7528ae73e3da12da58b42d14c987a1423cdc3c60 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Tue, 25 Jun 2024 16:06:30 +0800 Subject: [PATCH 12/14] Remove torch_dtype arg from is_pipeline_test_to_skip* --- tests/test_pipeline_mixin.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index acd8de480874..2dcbdb0cf8b0 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -209,7 +209,6 @@ def run_model_pipeline_tests( model_architecture, tokenizer_name, processor_name, - torch_dtype, ): logger.warning( f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " @@ -286,9 +285,7 @@ def run_pipeline_test( return pipeline_test_class_name = pipeline_test_mapping[task]["test"].__name__ - if self.is_pipeline_test_to_skip_more( - pipeline_test_class_name, model.config, model, tokenizer, processor, torch_dtype - ): + if self.is_pipeline_test_to_skip_more(pipeline_test_class_name, model.config, model, tokenizer, processor): logger.warning( f"{self.__class__.__name__}::test_pipeline_{task.replace('-', '_')}_{torch_dtype} is skipped: test is " f"currently known to fail for: model `{model_architecture.__name__}` | tokenizer " @@ -633,7 +630,7 @@ def test_pipeline_zero_shot_object_detection_fp16(self): # This contains the test cases to be skipped without model architecture being involved. def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name, torch_dtype + self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name ): """Skip some tests based on the classes or their names without the instantiated objects. @@ -650,9 +647,7 @@ def is_pipeline_test_to_skip( return False - def is_pipeline_test_to_skip_more( - self, pipeline_test_casse_name, config, model, tokenizer, processor, torch_dtype - ): # noqa + def is_pipeline_test_to_skip_more(self, pipeline_test_casse_name, config, model, tokenizer, processor): # noqa """Skip some more tests based on the information from the instantiated objects.""" # No fix is required for this case. if ( From 8fdb22d439a135336b4147ba7de492f9c8c79673 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Thu, 27 Jun 2024 21:50:08 +0800 Subject: [PATCH 13/14] Fix format --- tests/pipelines/test_pipelines_feature_extraction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index f6a985a33b33..4d25941c3f0f 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -193,7 +193,9 @@ def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): For now ignore those. """ ) - feature_extractor = FeatureExtractionPipeline(model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype) + feature_extractor = FeatureExtractionPipeline( + model=model, tokenizer=tokenizer, feature_extractor=processor, torch_dtype=torch_dtype + ) return feature_extractor, ["This is a test", "This is another test"] def run_pipeline_test(self, feature_extractor, examples): From 233ed50815772f7b4df5d1c2fa21ab9d4339c174 Mon Sep 17 00:00:00 2001 From: Billy Cao Date: Fri, 28 Jun 2024 13:45:55 +0800 Subject: [PATCH 14/14] trigger ci
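Note on the test pattern used throughout the series: each fp16 variant reuses its fp32 counterpart by calling the same test body with torch_dtype="float16", so the dtype is threaded through pipeline(...) or from_pretrained(...) rather than duplicating assertions. Below is a minimal, self-contained sketch of that pairing for illustration only; the task, model ID, and assertion are placeholder choices and are not tests taken from this patch series.

# Minimal sketch of the fp32/fp16 pipeline-test pairing (illustrative only).
# The task, model ID, and assertion here are placeholders, not part of the patches above.
import unittest

from transformers import pipeline
from transformers.testing_utils import require_torch


class ExamplePipelineFP16Test(unittest.TestCase):
    @require_torch
    def test_small_model_pt(self, torch_dtype="float32"):
        # The fp32 test accepts torch_dtype so the fp16 variant can reuse the same body.
        classifier = pipeline(
            "text-classification",
            model="hf-internal-testing/tiny-random-distilbert",
            torch_dtype=torch_dtype,
        )
        outputs = classifier("This is a test")
        # A single input string yields a list with one {"label", "score"} dict.
        self.assertEqual(len(outputs), 1)

    @require_torch
    def test_small_model_pt_fp16(self):
        # Same assertions, but the model is loaded and run in half precision.
        self.test_small_model_pt(torch_dtype="float16")

Because torch_dtype accepts the string dtype names used in the tests above, the same test body exercises both precisions without any branching in the assertions.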