[SPARKNLP-1068] Introducing BLIPForQuestionAnswering transformer (#14422)

* [SPARKNLP-1068] Introducing BLIPForQuestionAnswering transformer
* [SPARKNLP-1068] Adding BLIPForQuestionAnswering import notebook example
* [SPARKNLP-1068] Fix fullAnnotateImage validation
* [SPARKNLP-1068] Solves BLIPForQuestionAnsweringTest issue
* [SPARKNLP-1068] Updates default BLIPForQuestionAnswering model name
* [SPARKNLP-1068] [skip test] Adding documentation to BLIPForQuestionAnswering
Showing 22 changed files with 4,734 additions and 61 deletions.
examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb (3,425 additions, 0 deletions; large diff not rendered)
python/sparknlp/annotator/cv/blip_for_question_answering.py (172 additions, 0 deletions)

@@ -0,0 +1,172 @@
# Copyright 2017-2024 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sparknlp.common import *

class BLIPForQuestionAnswering(AnnotatorModel,
                               HasBatchedAnnotateImage,
                               HasImageFeatureProperties,
                               HasEngine,
                               HasCandidateLabelsProperties,
                               HasRescaleFactor):
    """BLIPForQuestionAnswering can load BLIP models for visual question
    answering. The model consists of a vision encoder, a text encoder, and a
    text decoder. The vision encoder encodes the input image, the text encoder
    encodes the input question together with the encoding of the image, and
    the text decoder outputs the answer to the question.

    Pretrained models can be loaded with :meth:`.pretrained` of the companion
    object:

    >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\
    ...     .setInputCols(["image_assembler"]) \\
    ...     .setOutputCol("answer")

    The default model is ``"blip_vqa_base"``, if no name is provided.

    For available pretrained models please see the `Models Hub
    <https://sparknlp.org/models?task=Question+Answering>`__.

    To see which models are compatible and how to import them see
    `Import Transformers into Spark NLP 🚀
    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.

    ====================== ======================
    Input Annotation types Output Annotation type
    ====================== ======================
    ``IMAGE``              ``DOCUMENT``
    ====================== ======================

    Parameters
    ----------
    batchSize
        Batch size. Larger values allow faster processing but require more
        memory, by default 2
    configProtoBytes
        ConfigProto from tensorflow, serialized into byte array.
    maxSentenceLength
        Max sentence length to process, by default 50

    Examples
    --------
    >>> import sparknlp
    >>> from sparknlp.base import *
    >>> from sparknlp.annotator import *
    >>> from pyspark.ml import Pipeline
    >>> from pyspark.sql.functions import lit
    >>> image_df = spark.read.format("image").load(path=images_path)
    >>> test_df = image_df.withColumn("text", lit("What's this picture about?"))
    >>> imageAssembler = ImageAssembler() \\
    ...     .setInputCol("image") \\
    ...     .setOutputCol("image_assembler")
    >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\
    ...     .setInputCols("image_assembler") \\
    ...     .setOutputCol("answer") \\
    ...     .setSize(384)
    >>> pipeline = Pipeline().setStages([
    ...     imageAssembler,
    ...     visualQAClassifier
    ... ])
    >>> result = pipeline.fit(test_df).transform(test_df)
    >>> result.select("image_assembler.origin", "answer.result").show(truncate=False)
    +--------------------------------------+------+
    |origin                                |result|
    +--------------------------------------+------+
    |[file:///content/images/cat_image.jpg]|[cats]|
    +--------------------------------------+------+
    """

    name = "BLIPForQuestionAnswering"

    inputAnnotatorTypes = [AnnotatorType.IMAGE]

    outputAnnotatorType = AnnotatorType.DOCUMENT

    configProtoBytes = Param(Params._dummy(),
                             "configProtoBytes",
                             "ConfigProto from tensorflow, serialized into byte array. Get with "
                             "config_proto.SerializeToString()",
                             TypeConverters.toListInt)

    maxSentenceLength = Param(Params._dummy(),
                              "maxSentenceLength",
                              "Maximum sentence length that the annotator will process. Above this, the sentence is skipped",
                              typeConverter=TypeConverters.toInt)

    def setMaxSentenceSize(self, value):
        """Sets the maximum sentence length that the annotator will process,
        by default 50.

        Parameters
        ----------
        value : int
            Maximum sentence length that the annotator will process
        """
        return self._set(maxSentenceLength=value)

    @keyword_only
    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering",
                 java_model=None):
        super(BLIPForQuestionAnswering, self).__init__(
            classname=classname,
            java_model=java_model
        )
        self._setDefault(
            batchSize=2,
            size=384,
            maxSentenceLength=50
        )

    @staticmethod
    def loadSavedModel(folder, spark_session):
        """Loads a locally saved model.

        Parameters
        ----------
        folder : str
            Folder of the saved model
        spark_session : pyspark.sql.SparkSession
            The current SparkSession

        Returns
        -------
        BLIPForQuestionAnswering
            The restored model
        """
        from sparknlp.internal import _BLIPForQuestionAnswering
        jModel = _BLIPForQuestionAnswering(folder, spark_session._jsparkSession)._java_obj
        return BLIPForQuestionAnswering(java_model=jModel)

    @staticmethod
    def pretrained(name="blip_vqa_base", lang="en", remote_loc=None):
        """Downloads and loads a pretrained model.

        Parameters
        ----------
        name : str, optional
            Name of the pretrained model, by default "blip_vqa_base"
        lang : str, optional
            Language of the pretrained model, by default "en"
        remote_loc : str, optional
            Optional remote address of the resource, by default None. Will use
            Spark NLP's repositories otherwise.

        Returns
        -------
        BLIPForQuestionAnswering
            The restored model
        """
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(BLIPForQuestionAnswering, name, lang, remote_loc)
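
For readers coming from the import notebook above, here is a minimal sketch of the round trip that loadSavedModel enables. The export directory, save path, and session setup are illustrative assumptions, not part of this commit; only loadSavedModel, the column setters, and setSize come from the file itself.

import sparknlp
from sparknlp.annotator import BLIPForQuestionAnswering

spark = sparknlp.start()

# Hypothetical path to a TensorFlow export produced along the lines of the
# accompanying HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb notebook.
EXPORT_PATH = "./export/blip-vqa-base"

# Import the exported model and configure it like the pretrained default.
blip = BLIPForQuestionAnswering.loadSavedModel(EXPORT_PATH, spark) \
    .setInputCols(["image_assembler"]) \
    .setOutputCol("answer") \
    .setSize(384)

# Persist it as a Spark NLP model so later runs can skip the import step,
# then restore it with the standard Spark ML reader.
blip.write().overwrite().save("./blip_vqa_base_spark_nlp")
restored = BLIPForQuestionAnswering.load("./blip_vqa_base_spark_nlp")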
python/test/annotator/cv/blip_for_question_answering_test.py (80 additions, 0 deletions)

@@ -0,0 +1,80 @@
# Copyright 2017-2024 John Snow Labs
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import pytest
import os

from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit
from test.util import SparkSessionForTest

class BLIPForQuestionAnsweringTestSetup(unittest.TestCase):

    def setUp(self):
        self.images_path = os.getcwd() + "/../src/test/resources/image/"
        image_df = SparkSessionForTest.spark.read.format("image").load(
            path=self.images_path
        )

        self.test_df = image_df.withColumn("text", lit("What's this picture about?"))

        image_assembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler")

        imageClassifier = BLIPForQuestionAnswering.pretrained() \
            .setInputCols("image_assembler") \
            .setOutputCol("answer") \
            .setSize(384)

        self.pipeline = Pipeline(
            stages=[
                image_assembler,
                imageClassifier,
            ]
        )

        self.model = self.pipeline.fit(self.test_df)

@pytest.mark.slow
class BLIPForQuestionAnsweringTest(BLIPForQuestionAnsweringTestSetup, unittest.TestCase):

    def setUp(self):
        super().setUp()

    def runTest(self):
        result = self.model.transform(self.test_df).collect()

        for row in result:
            # row["answer"] is a list of annotations; assert it is non-empty
            # (comparing the list against "" would always pass).
            self.assertTrue(len(row["answer"]) > 0)


@pytest.mark.slow
class LightBLIPForQuestionAnsweringTest(BLIPForQuestionAnsweringTestSetup, unittest.TestCase):

    def setUp(self):
        super().setUp()

    def runTest(self):
        light_pipeline = LightPipeline(self.model)
        image_path = self.images_path + "bluetick.jpg"
        print("image_path: " + image_path)
        annotations_result = light_pipeline.fullAnnotateImage(
            image_path,
            "What's this picture about?"
        )

        for result in annotations_result:
            self.assertTrue(len(result["image_assembler"]) > 0)
            self.assertTrue(len(result["answer"]) > 0)
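
As a usage note on the LightPipeline path exercised above, here is a minimal sketch of batching several image/question pairs through fullAnnotateImage. It assumes the method accepts parallel lists of paths and questions (plausible given the validation fix in this commit, but an assumption here), and the second image name is illustrative.

# Not part of this commit's test suite: a hypothetical batched call.
light_pipeline = LightPipeline(self.model)
images = [self.images_path + "bluetick.jpg", self.images_path + "junco.jpg"]
questions = ["What's this picture about?", "What animal is this?"]

# Assumes fullAnnotateImage pairs images[i] with questions[i].
results = light_pipeline.fullAnnotateImage(images, questions)

for result in results:
    # Each entry mirrors the single-image case above: one list of
    # image_assembler annotations and one list of answer annotations.
    print(result["answer"][0].result)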