From 6d3b273d463d533e8d62dcedf97077f05d796597 Mon Sep 17 00:00:00 2001 From: Danilo Burbano <37355249+danilojsl@users.noreply.github.com> Date: Mon, 9 Dec 2024 09:24:20 -0500 Subject: [PATCH] [SPARKNLP-1068] Introducing BLIPForQuestionAnswering transformer (#14422) * [SPARKNLP-1068] Introducing BLIPForQuestionAnswering transformer * [SPARKNLP-1068] Adding BLIPForQuestionAnswering import notebook example * [SPARKNLP-1068] Fix fullAnnotateImage validation * [SPARKNLP-1068] Solves BLIPForQuestionAnsweringTest issue * [SPARKNLP-1068] Updates default BLIPForQuestionAnswering model name * [SPARKNLP-1068] [skip test] Adding documentation to BLIPForQuestionAnswering --- ...n_Spark_NLP_BLIPForQuestionAnswering.ipynb | 3425 +++++++++++++++++ python/sparknlp/annotator/cv/__init__.py | 1 + .../cv/blip_for_question_answering.py | 172 + python/sparknlp/base/image_assembler.py | 11 + python/sparknlp/base/light_pipeline.py | 29 +- python/sparknlp/internal/__init__.py | 8 + .../cv/blip_for_question_answering_test.py | 80 + .../johnsnowlabs/ml/ai/BLIPClassifier.scala | 215 ++ .../johnsnowlabs/nlp/AnnotationImage.scala | 24 +- .../nlp/HasBatchedAnnotateImage.scala | 3 +- .../com/johnsnowlabs/nlp/ImageAssembler.scala | 40 +- .../com/johnsnowlabs/nlp/LightPipeline.scala | 81 +- .../cv/BLIPForQuestionAnswering.scala | 384 ++ .../tokenizer/bpe/BertTokenizer.scala | 81 + .../tokenizer/bpe/BpeSpecialTokens.scala | 8 + .../nlp/pretrained/PretrainedPipeline.scala | 11 +- .../johnsnowlabs/nlp/AssertAnnotations.scala | 9 +- .../johnsnowlabs/nlp/ImageAssemblerTest.scala | 29 +- .../cv/BLIPForQuestionAnsweringTest.scala | 174 + ...LIPForZeroShotClassificationTestSpec.scala | 2 +- .../cv/ViTImageClassificationTestSpec.scala | 6 +- ...derDecoderForImageCaptioningTestSpec.scala | 2 +- 22 files changed, 4734 insertions(+), 61 deletions(-) create mode 100644 examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb create mode 100644 python/sparknlp/annotator/cv/blip_for_question_answering.py create mode 100644 python/test/annotator/cv/blip_for_question_answering_test.py create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala diff --git a/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb b/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb new file mode 100644 index 00000000000000..c1e15d7d45bf1f --- /dev/null +++ b/examples/python/transformers/HuggingFace_in_Spark_NLP_BLIPForQuestionAnswering.ipynb @@ -0,0 +1,3425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "UiBTGTRfSCQh" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_CLIP.ipynb)\n", + "\n", + "# Import ONNX BLIP models from HuggingFace ๐Ÿค— into Spark NLP ๐Ÿš€\n", + "\n", + "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", + "\n", + "- This feature is only in `Spark NLP 5.5.1` and after. 
So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import BLIP models trained/fine-tuned for question answering via `TFBlipForQuestionAnswering`.\n", + "- Reference: [TFBlipForQuestionAnswering](https://huggingface.co/docs/transformers/en/model_doc/blip#transformers.TFBlipForQuestionAnswering)\n", + "- Some [example models](https://huggingface.co/models?pipeline_tag=visual-question-answering&sort=trending&search=BLIP)\n", + "- To execute this notebook on Google Colab you will need an A100 or similar instance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vkGbcTagUK4P" + }, + "source": [ + "## Export and Save HuggingFace model\n", + "\n", + "- We lock TensorFlow on `2.11.0` version and Transformers on `4.39.3`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "N9RXtKzHaEvi", + "outputId": "5631c0ca-0f5f-4f38-c9ab-9a5591906067" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m46.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m77.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m86.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90mโ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m41.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires protobuf<5,>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-aiplatform 1.67.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-bigquery-storage 2.26.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-pubsub 2.23.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "googleapis-common-protos 1.65.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.\n", + "pandas-gbq 0.23.1 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.6 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.19.6 which is incompatible.\n", + "tf-keras 2.17.0 requires tensorflow<2.18,>=2.17, but you have tensorflow 2.11.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q tensorflow==2.11.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fIGek4zAUVM9" + }, + "source": [ + "- HuggingFace comes with a native `saved_model` feature inside `save_pretrained` function for TensorFlow based models. We will use that to save it as TF `SavedModel`.\n", + "- We'll use [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) model from HuggingFace as an example\n", + "- In addition to `TFBlipForQuestionAnswering` we also need to save the `BlipProcessor`." 
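The cell that downloads the checkpoint appears below mostly through its progress widgets, so here is a minimal sketch of that loading-and-saving step, assuming the standard Hugging Face `from_pretrained`/`save_pretrained` API and the `{MODEL_NAME}_blip_processor` directory name that the notebook lists later:

```python
# Sketch of the export step described above
# (assumes transformers 4.39.3 / TF 2.11 as pinned earlier in this notebook).
from transformers import BlipProcessor, TFBlipForQuestionAnswering

MODEL_NAME = "Salesforce/blip-vqa-base"

# Download the model weights and the matching processor from the Hub.
model = TFBlipForQuestionAnswering.from_pretrained(MODEL_NAME)
processor = BlipProcessor.from_pretrained(MODEL_NAME)

# Saving the processor writes preprocessor_config.json, tokenizer.json,
# vocab.txt, etc.: the files that are later copied into saved_model/1/assets.
processor.save_pretrained("./{}_blip_processor".format(MODEL_NAME))
```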
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "n1tqMsNXK5lN" + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + "from transformers import BlipProcessor, TFBlipForQuestionAnswering\n", + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "PiEKBy42ezX7" + }, + "outputs": [], + "source": [ + "MODEL_NAME = \"Salesforce/blip-vqa-base\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 353, + "referenced_widgets": [ + "a8fc97ee9a5646268761e3362eb07ccd", + "0bf25fe03bcb4c9f9c0c2556d7a1ea99", + "58cac0f27ae347debd32014c34b37a1e", + "4e7a8a4a4bef4012bb7c8d3f31056ac2", + "bfbe18f452db43bea36212209eceac60", + "427370f1a81246fd85323abba58483ac", + "158c854e5e744216b485e8e0eaf33d14", + "d07cf17e58214062be88f5da1c55221b", + "2ea6b3a04c274905b5cdb76a4d1d197a", + "b03cae4fb10a47b5ac4b69cdaaa913d0", + "55e8c34dfbbb48f6b00a16762f107787", + "800ef838b66343659fffc789449c0a9f", + "22215a25c1f04cf3bc994b91716ecd91", + "a572bc9c98bb49598735bd4af9cef841", + "9c4125362fc44efea531faf2d48e6e04", + "a93f052249df447481ecf3531e52dcb2", + "ebf1f217cdef4024a9aecd90c2471986", + "98adb63f15664ac88046d941690cf13c", + "a2d6850c56e04bc08633717c569a6393", + "749cdc9d728e4ff18ec8192eb0062789", + "569e4bb367274c37bab0a314cd998e23", + "228cdee565d545f9a35b7bcbeafd29e7", + "cb4387e38cfb462ab8d53466ad9c69c8", + "26f1c75dbc8d4faab3c5874c1fbc9802", + "04e16cc0b237449299e3858c9db4295f", + "39a19e2bca9c4c1cb057cb225e90f0cf", + "9dfb9fa922954e2fac9867039e35a8bd", + "98f5799ac2314802a4d5565c05b93597", + "6331f40bb5394cb9b0ca9c5dfb104d6c", + "76f07bae7301446280b973486572e9fa", + "252ed515f22a48e2b97857e453945fb5", + "9717a812f3f84fc9ae100f9915f680df", + "22b606b09395484aaea3946d02319eca", + "2264d7fdc4a14032b4704c0caa64d8fb", + "b8c1b72a53ca4b14b7ff874942819011", + "c1048df076c946db8909c7091b82fcfa", + "6ee8baa1c4624a74835f0a434da22ce6", + "c375f592a3ab4dbbb2ff2dd98817dc1c", + "b71dcd5229a9409b83a45c561cd57489", + "9a0d0ec79a8142c3b5113bce264adeb9", + "3c2c91312ae146f8b1e95d3e81ad0056", + "ad23ef6e0c64424bb28127a9bf6b4951", + "7a99d35b201b45ceb9f18bb21bbf5cee", + "dfbd503e8f31449fa7c2358001fc77cb", + "151a916c65ee4196ae7cb53406365c45", + "33e4be1c2ce040baae33e3f100dad4f6", + "f71322f009844d02830f45b40632dc6a", + "58baacaa12b840ef9fb48bdd797ed498", + "ff0bd78c11b34f92a861029aeb3c9d3a", + "4f71c03378fc4ede80dd4c07b319df8d", + "4e345925052f464fb4aaaa92a1bd4fc7", + "e167c4bf6725441d89edcd705ba032be", + "eca99f2c5400456d92948305189d66a6", + "aebced9d65414171a2b8bc0602be1993", + "9c4c3703c5ed48c9a753797ee56b00fc" + ] + }, + "id": "NgLAnDuhexzT", + "outputId": "0612907f-81f6-4526-e16a-25822771db73" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"a8fc97ee9a5646268761e3362eb07ccd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "preprocessor_config.json: 0%| | 0.00/445 [00:00> and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: 'NoneType' object has no attribute '_fields'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: 'NoneType' object has no attribute '_fields'\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.\n", + "WARNING:tensorflow:AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING: AutoGraph could not transform > and will run it as-is.\n", + "Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.\n", + "Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method\n", + "To silence this warning, decorate the function with @tf.autograph.experimental.do_not_convert\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tensorflow/python/autograph/impl/api.py:371: UserWarning: Using the model-agnostic default `max_length` (=20) to control the generation length. recommend setting `max_new_tokens` to control the maximum length of the generation.\n", + " return py_builtins.overload_of(f)(*args)\n", + "WARNING:absl:Found untraced functions such as serving, serving, serving, serving, patch_embedding_layer_call_fn while saving (showing 5 of 1569). 
These functions will not be directly callable after loading.\n" + ] + } + ], + "source": [ + "# Define TF Signature\n", + "@tf.function(\n", + " input_signature=[\n", + " {\n", + " \"pixel_values\": tf.TensorSpec((1, None, None, None), tf.float32, name=\"pixel_values\"),\n", + " \"input_ids\": tf.TensorSpec((1, None), tf.int32, name=\"input_ids\"),\n", + " \"attention_mask\": tf.TensorSpec((1, None), tf.int64, name=\"attention_mask\")\n", + " }\n", + " ]\n", + ")\n", + "def serving_fn(inputs):\n", + " # Unpack the input dictionary and pass it to the model's generate function\n", + " return model.generate(\n", + " input_ids=inputs[\"input_ids\"],\n", + " pixel_values=inputs[\"pixel_values\"],\n", + " attention_mask=inputs.get(\"attention_mask\", None)\n", + " )\n", + "\n", + "model.save_pretrained(\"./{}\".format(MODEL_NAME), saved_model=True, signatures={\"serving_default\": serving_fn.get_concrete_function()})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FYF-xt3HWEr0" + }, + "source": [ + "Let's have a look inside these two directories and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oTlKokmrsVDR", + "outputId": "b56b637b-76a8-4471-f908-908dc44bd117" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 936\n", + "-rw-r--r-- 1 root root 471 Oct 2 18:10 preprocessor_config.json\n", + "-rw-r--r-- 1 root root 695 Oct 2 18:10 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1348 Oct 2 18:10 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 711396 Oct 2 18:10 tokenizer.json\n", + "-rw-r--r-- 1 root root 231508 Oct 2 18:10 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}_blip_processor" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hVzKx5bUWGny", + "outputId": "b4d9ae80-f865-4e1e-825c-a02a68ce9958" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1503636\n", + "-rw-r--r-- 1 root root 664 Oct 2 18:18 config.json\n", + "-rw-r--r-- 1 root root 136 Oct 2 18:18 generation_config.json\n", + "drwxr-xr-x 3 root root 4096 Oct 2 18:14 saved_model\n", + "-rw-r--r-- 1 root root 1539703504 Oct 2 18:18 tf_model.h5\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JcEP4XF9WXYb", + "outputId": "2952576f-b7a6-411f-9487-605be09b654c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 61764\n", + "drwxr-xr-x 2 root root 4096 Oct 2 18:14 assets\n", + "-rw-r--r-- 1 root root 55 Oct 2 18:18 fingerprint.pb\n", + "-rw-r--r-- 1 root root 604021 Oct 2 18:18 keras_metadata.pb\n", + "-rw-r--r-- 1 root root 62626669 Oct 2 18:18 saved_model.pb\n", + "drwxr-xr-x 2 root root 4096 Oct 2 18:17 variables\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}/saved_model/1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WQ0yckQRsYCx" + }, + "source": [ + "So we need to move the files `preprocessor_config.json`, `tokenizer.json` and `vocab.txt` from processor to assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HWaeOrl6UDOI" + }, + "source": [ + "- As you can see, we need the SavedModel from `saved_model/1/` path\n", + "- We also be needing 
`preprocessor_config.json`, `tokenizer.json` and `vocab.txt` from processor\n", + "- All we need is to just copy those files to `saved_model/1/assets` which Spark NLP will look for" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "xiuyWqlLs4OL" + }, + "outputs": [], + "source": [ + "!mv {MODEL_NAME}_blip_processor/preprocessor_config.json {MODEL_NAME}/saved_model/1/assets\n", + "!mv {MODEL_NAME}_blip_processor/tokenizer.json {MODEL_NAME}/saved_model/1/assets\n", + "!mv {MODEL_NAME}_blip_processor/vocab.txt {MODEL_NAME}/saved_model/1/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wa1yVpATVrZv" + }, + "source": [ + "Voila! We have our `preprocessor_config.json`, `tokenizer.json` and `vocab.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ljkBpPTftE8G", + "outputId": "e5922df7-f2be-409e-e395-83e2974a5750" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 928\n", + "-rw-r--r-- 1 root root 471 Oct 2 18:10 preprocessor_config.json\n", + "-rw-r--r-- 1 root root 711396 Oct 2 18:10 tokenizer.json\n", + "-rw-r--r-- 1 root root 231508 Oct 2 18:10 vocab.txt\n" + ] + } + ], + "source": [ + "!ls -l {MODEL_NAME}/saved_model/1/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7NdEMMiXTQbn" + }, + "source": [ + "## Import and Save BertForQuestionAnswering in Spark NLP" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YumDH6zHV1af" + }, + "source": [ + "Let's install and setup Spark NLP in Google Colab\n", + "This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "Qb994CB80vU-" + }, + "outputs": [], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "klO_mqUs1WgE", + "outputId": "ff8b25e6-ea0c-4d59-fded-db93e3213d97" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/lib/python3.10/subprocess.py:1796: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", + " self.pid = _posixsubprocess.fork_exec(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.4.0\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Yj1LrqgXSp22" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `BLIPForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." 
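As a small illustration of that last recommendation (not part of the original notebook), the sketch below saves the imported annotator to a shared location and restores it with `.load()`. The `hdfs:///...` path is a placeholder assumption for whatever distributed storage your cluster uses, and it presumes the `blip_for_question_answering` variable created by the `loadSavedModel` cell that follows:

```python
# Placeholder path: swap in your own HDFS/S3/DBFS location.
shared_path = "hdfs:///models/blip_vqa_base_spark_nlp"

# Same write().overwrite().save() call used later for the local copy.
blip_for_question_answering.write().overwrite().save(shared_path)

# Any other Spark session can then restore the annotator natively with .load().
restored_blip = BLIPForQuestionAnswering.load(shared_path) \
    .setInputCols("image_assembler") \
    .setOutputCol("answer") \
    .setSize(384)
```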
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "s0IKr6l21dmt" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "blip_for_question_answering = BLIPForQuestionAnswering.loadSavedModel(\n", + " '{}/saved_model/1'.format(MODEL_NAME),\n", + " spark\n", + " )\\\n", + " .setSize(384)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S2SXFXqqV7io" + }, + "source": [ + "Let's save it on disk so it is easier to be moved around and also be used later via .load function" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "O_WLb5WTV-sI" + }, + "outputs": [], + "source": [ + "blip_for_question_answering.write().overwrite().save(\"./{}_spark_nlp\".format(MODEL_NAME))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8c-9B3fXWDqi" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "id": "qNTTflXjWELp" + }, + "outputs": [], + "source": [ + "!rm -rf {MODEL_NAME}_blip_processor {MODEL_NAME}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bMNZ2gdcWPJI" + }, + "source": [ + "Awesome ๐Ÿ˜Ž !\n", + "\n", + "This is your BLIPForQuestionAnswering model from HuggingFace ๐Ÿค— loaded and saved by Spark NLP ๐Ÿš€" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JPoiZrbg-agf", + "outputId": "e8be56dd-f998-499c-f8e5-b738ce81a989" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1563412\n", + "-rw-r--r-- 1 root root 1600921187 Oct 2 18:42 blip_vqa_tensorflow\n", + "drwxr-xr-x 4 root root 4096 Oct 2 18:41 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 2 18:41 metadata\n" + ] + } + ], + "source": [ + "! ls -l {MODEL_NAME}_spark_nlp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Oizr-BZYWVmj" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny BLIPForQuestionAnswering model in Spark NLP ๐Ÿš€ pipeline!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kfXocFvjWbOq" + }, + "source": [ + "Let's try with a public image of cats" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qNGGZSbxAkSp", + "outputId": "70c64f2f-3347-460e-8df2-d02fb036ff32" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-10-02 18:42:30-- http://images.cocodataset.org/val2017/000000039769.jpg\n", + "Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.27.152, 3.5.29.161, 16.182.34.49, ...\n", + "Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.27.152|:80... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 173131 (169K) [image/jpeg]\n", + "Saving to: โ€˜/content/cat_image.jpgโ€™\n", + "\n", + "/content/cat_image. 
100%[===================>] 169.07K 312KB/s in 0.5s \n", + "\n", + "2024-10-02 18:42:31 (312 KB/s) - โ€˜/content/cat_image.jpgโ€™ saved [173131/173131]\n", + "\n" + ] + } + ], + "source": [ + "!wget -O /content/cat_image.jpg \"http://images.cocodataset.org/val2017/000000039769.jpg\"" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "MDeYB-PGAvgA" + }, + "outputs": [], + "source": [ + "!mkdir images\n", + "!mv cat_image.jpg images" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l6Ii_rwDWn3J" + }, + "source": [ + "To proceed, please create a DataFrame with two columns:\n", + "\n", + "- An `image` column that contains the file path for each image in the directory.\n", + "- A `text` column where you can input the specific question you would like to ask about each image." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GlJRrn7NA5_3", + "outputId": "13703fbb-0085-49dd-9909-212bc45624f1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| image| text|\n", + "+--------------------+--------------------+\n", + "|{file:///content/...|What's this pictu...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.functions import lit\n", + "\n", + "images_path = \"./images/\"\n", + "image_df = spark.read.format(\"image\").load(path=images_path)\n", + "\n", + "test_df = image_df.withColumn(\"text\", lit(\"What's this picture about?\"))\n", + "test_df.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XO8RXVifXNbZ" + }, + "source": [ + "Now let's build our `BLIPForQuestionAnswering` pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "00MxfP2KBKpW" + }, + "outputs": [], + "source": [ + "imageAssembler = ImageAssembler() \\\n", + " .setInputCol(\"image\") \\\n", + " .setOutputCol(\"image_assembler\") \\\n", + "\n", + "imageClassifier = BLIPForQuestionAnswering.load(\"./{}_spark_nlp\".format(MODEL_NAME)) \\\n", + " .setInputCols(\"image_assembler\") \\\n", + " .setOutputCol(\"answer\") \\\n", + " .setSize(384)\n", + "\n", + "pipeline = Pipeline(\n", + " stages=[\n", + " imageAssembler,\n", + " imageClassifier,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "m3z6twXbBhw4" + }, + "outputs": [], + "source": [ + "model = pipeline.fit(test_df)\n", + "result = model.transform(test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_8NQhgilCGDO", + "outputId": "ed295952-9553-4780-f3fd-9a6adea89fe7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------------------------+------+\n", + "|origin |result|\n", + "+--------------------------------------+------+\n", + "|[file:///content/images/cat_image.jpg]|[cats]|\n", + "+--------------------------------------+------+\n", + "\n" + ] + } + ], + "source": [ + "result.select(\"image_assembler.origin\", \"answer.result\").show(truncate = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YDvCiVP3XXPd" + }, + "source": [ + "That's it! 
You can now go wild and use hundreds of `BLIPForQuestionAnswering` models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "04e16cc0b237449299e3858c9db4295f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_76f07bae7301446280b973486572e9fa", + "max": 231508, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_252ed515f22a48e2b97857e453945fb5", + "value": 231508 + } + }, + "0b1ed81f489c4fd09ab7bb1d1ad938fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0bf25fe03bcb4c9f9c0c2556d7a1ea99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_427370f1a81246fd85323abba58483ac", + "placeholder": "โ€‹", + "style": "IPY_MODEL_158c854e5e744216b485e8e0eaf33d14", + "value": "preprocessor_config.json:โ€‡100%" + } + }, + "0e3e739b6a5c4e4aaec788974ef551b5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7c4edf1f672042e68e6a15e7da5a0127", + "placeholder": "โ€‹", + "style": "IPY_MODEL_21951a3e1c6a4650851d4ee31cd2387f", + "value": "โ€‡1.54G/1.54Gโ€‡[00:51<00:00,โ€‡29.4MB/s]" + } + }, + "111f56022b3c4737a9f643143673c6b5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "151a916c65ee4196ae7cb53406365c45": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_33e4be1c2ce040baae33e3f100dad4f6", + "IPY_MODEL_f71322f009844d02830f45b40632dc6a", + "IPY_MODEL_58baacaa12b840ef9fb48bdd797ed498" + ], + "layout": "IPY_MODEL_ff0bd78c11b34f92a861029aeb3c9d3a" + } + }, + "158c854e5e744216b485e8e0eaf33d14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "16ddbd3fcb7f4dba8e8b48d6f6962046": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_111f56022b3c4737a9f643143673c6b5", + "placeholder": "โ€‹", + "style": "IPY_MODEL_af46ebc1d3d84a8589920ee7338936cf", + "value": "config.json:โ€‡100%" + } + }, + "18317efb0631479bbbd6f373942c7349": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "21951a3e1c6a4650851d4ee31cd2387f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "22215a25c1f04cf3bc994b91716ecd91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ebf1f217cdef4024a9aecd90c2471986", + "placeholder": "โ€‹", + "style": "IPY_MODEL_98adb63f15664ac88046d941690cf13c", + "value": "tokenizer_config.json:โ€‡100%" + } + }, + "2264d7fdc4a14032b4704c0caa64d8fb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b8c1b72a53ca4b14b7ff874942819011", + "IPY_MODEL_c1048df076c946db8909c7091b82fcfa", + "IPY_MODEL_6ee8baa1c4624a74835f0a434da22ce6" + ], + "layout": "IPY_MODEL_c375f592a3ab4dbbb2ff2dd98817dc1c" + } + }, + "228cdee565d545f9a35b7bcbeafd29e7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "22b606b09395484aaea3946d02319eca": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "252ed515f22a48e2b97857e453945fb5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "26f1c75dbc8d4faab3c5874c1fbc9802": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_98f5799ac2314802a4d5565c05b93597", + "placeholder": "โ€‹", + "style": 
"IPY_MODEL_6331f40bb5394cb9b0ca9c5dfb104d6c", + "value": "vocab.txt:โ€‡100%" + } + }, + "2ea6b3a04c274905b5cdb76a4d1d197a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "33e4be1c2ce040baae33e3f100dad4f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4f71c03378fc4ede80dd4c07b319df8d", + "placeholder": "โ€‹", + "style": "IPY_MODEL_4e345925052f464fb4aaaa92a1bd4fc7", + "value": "special_tokens_map.json:โ€‡100%" + } + }, + "39202d00e08f49d196159bdd16c29f6f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6fe7e0e408d54752ae71d47a58f31469", + "placeholder": "โ€‹", + "style": "IPY_MODEL_ddddfea881df4a7b89845fb4485edf0d", + "value": "model.safetensors:โ€‡100%" + } + }, + "39a19e2bca9c4c1cb057cb225e90f0cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9717a812f3f84fc9ae100f9915f680df", + "placeholder": "โ€‹", + "style": "IPY_MODEL_22b606b09395484aaea3946d02319eca", + "value": "โ€‡232k/232kโ€‡[00:00<00:00,โ€‡668kB/s]" + } + }, + "3c2c91312ae146f8b1e95d3e81ad0056": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": 
null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "427370f1a81246fd85323abba58483ac": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4e345925052f464fb4aaaa92a1bd4fc7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4e7a8a4a4bef4012bb7c8d3f31056ac2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b03cae4fb10a47b5ac4b69cdaaa913d0", + "placeholder": "โ€‹", + "style": "IPY_MODEL_55e8c34dfbbb48f6b00a16762f107787", + "value": "โ€‡445/445โ€‡[00:00<00:00,โ€‡32.3kB/s]" + } + }, + "4f5e6c1c45794f03aed2dd7223dd3255": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_16ddbd3fcb7f4dba8e8b48d6f6962046", + "IPY_MODEL_d11879914a854d8a91a4872ef4afc942", + "IPY_MODEL_ec039adb3b1f4522a7dac4386040590a" + ], + "layout": "IPY_MODEL_f7de63cc1da94daf9dc83406301873a3" + } + }, + "4f71c03378fc4ede80dd4c07b319df8d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "559b67a1bb9240a887a34c9eafda45eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "55e8c34dfbbb48f6b00a16762f107787": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "569e4bb367274c37bab0a314cd998e23": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "58baacaa12b840ef9fb48bdd797ed498": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_aebced9d65414171a2b8bc0602be1993", + "placeholder": "โ€‹", + "style": "IPY_MODEL_9c4c3703c5ed48c9a753797ee56b00fc", + "value": "โ€‡125/125โ€‡[00:00<00:00,โ€‡11.2kB/s]" + } + }, + "58cac0f27ae347debd32014c34b37a1e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d07cf17e58214062be88f5da1c55221b", + "max": 445, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2ea6b3a04c274905b5cdb76a4d1d197a", + "value": 445 + } + }, + "5ce925ad60054d518453a6c6ae8d1707": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "626dcbd9418949b0b7e5dc8680f9b19b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_704723b61c674d3d9c322f6b31c9830a", + "max": 1538800584, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_559b67a1bb9240a887a34c9eafda45eb", + "value": 1538800584 + } + }, + "6331f40bb5394cb9b0ca9c5dfb104d6c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "description_width": "" + } + }, + "6ee8baa1c4624a74835f0a434da22ce6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7a99d35b201b45ceb9f18bb21bbf5cee", + "placeholder": "โ€‹", + "style": "IPY_MODEL_dfbd503e8f31449fa7c2358001fc77cb", + "value": "โ€‡711k/711kโ€‡[00:00<00:00,โ€‡1.37MB/s]" + } + }, + "6fe7e0e408d54752ae71d47a58f31469": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "704723b61c674d3d9c322f6b31c9830a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "749cdc9d728e4ff18ec8192eb0062789": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "763498ed74e6446a972930ab96d5d4d8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "76f07bae7301446280b973486572e9fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7a99d35b201b45ceb9f18bb21bbf5cee": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": 
null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c4edf1f672042e68e6a15e7da5a0127": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "800ef838b66343659fffc789449c0a9f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_22215a25c1f04cf3bc994b91716ecd91", + "IPY_MODEL_a572bc9c98bb49598735bd4af9cef841", + "IPY_MODEL_9c4125362fc44efea531faf2d48e6e04" + ], + "layout": "IPY_MODEL_a93f052249df447481ecf3531e52dcb2" + } + }, + "9717a812f3f84fc9ae100f9915f680df": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "98adb63f15664ac88046d941690cf13c": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "98f5799ac2314802a4d5565c05b93597": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9a0d0ec79a8142c3b5113bce264adeb9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9c4125362fc44efea531faf2d48e6e04": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_569e4bb367274c37bab0a314cd998e23", + "placeholder": "โ€‹", + "style": "IPY_MODEL_228cdee565d545f9a35b7bcbeafd29e7", + "value": "โ€‡592/592โ€‡[00:00<00:00,โ€‡53.5kB/s]" + } + }, + "9c4c3703c5ed48c9a753797ee56b00fc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9dfb9fa922954e2fac9867039e35a8bd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2d6850c56e04bc08633717c569a6393": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a572bc9c98bb49598735bd4af9cef841": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2d6850c56e04bc08633717c569a6393", + "max": 592, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_749cdc9d728e4ff18ec8192eb0062789", + "value": 592 + } + }, + "a8fc97ee9a5646268761e3362eb07ccd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0bf25fe03bcb4c9f9c0c2556d7a1ea99", + "IPY_MODEL_58cac0f27ae347debd32014c34b37a1e", + 
"IPY_MODEL_4e7a8a4a4bef4012bb7c8d3f31056ac2" + ], + "layout": "IPY_MODEL_bfbe18f452db43bea36212209eceac60" + } + }, + "a9265e8b56b14330a51ac0e07faab189": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_39202d00e08f49d196159bdd16c29f6f", + "IPY_MODEL_626dcbd9418949b0b7e5dc8680f9b19b", + "IPY_MODEL_0e3e739b6a5c4e4aaec788974ef551b5" + ], + "layout": "IPY_MODEL_5ce925ad60054d518453a6c6ae8d1707" + } + }, + "a93f052249df447481ecf3531e52dcb2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ad23ef6e0c64424bb28127a9bf6b4951": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "aebced9d65414171a2b8bc0602be1993": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + 
"object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af46ebc1d3d84a8589920ee7338936cf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b03cae4fb10a47b5ac4b69cdaaa913d0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b71dcd5229a9409b83a45c561cd57489": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8c1b72a53ca4b14b7ff874942819011": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b71dcd5229a9409b83a45c561cd57489", + "placeholder": "โ€‹", + "style": "IPY_MODEL_9a0d0ec79a8142c3b5113bce264adeb9", + "value": "tokenizer.json:โ€‡100%" + } + }, + "bfbe18f452db43bea36212209eceac60": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c1048df076c946db8909c7091b82fcfa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3c2c91312ae146f8b1e95d3e81ad0056", + "max": 711396, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ad23ef6e0c64424bb28127a9bf6b4951", + "value": 711396 + } + }, + "c375f592a3ab4dbbb2ff2dd98817dc1c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"caa25abd3df346da806da3659070ae87": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cb4387e38cfb462ab8d53466ad9c69c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_26f1c75dbc8d4faab3c5874c1fbc9802", + "IPY_MODEL_04e16cc0b237449299e3858c9db4295f", + "IPY_MODEL_39a19e2bca9c4c1cb057cb225e90f0cf" + ], + "layout": "IPY_MODEL_9dfb9fa922954e2fac9867039e35a8bd" + } + }, + "d07cf17e58214062be88f5da1c55221b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d11879914a854d8a91a4872ef4afc942": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_caa25abd3df346da806da3659070ae87", + "max": 4559, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0b1ed81f489c4fd09ab7bb1d1ad938fb", + "value": 4559 + } + }, + "ddddfea881df4a7b89845fb4485edf0d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dfbd503e8f31449fa7c2358001fc77cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e167c4bf6725441d89edcd705ba032be": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ebf1f217cdef4024a9aecd90c2471986": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ec039adb3b1f4522a7dac4386040590a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_763498ed74e6446a972930ab96d5d4d8", + "placeholder": "โ€‹", + "style": "IPY_MODEL_18317efb0631479bbbd6f373942c7349", + "value": "โ€‡4.56k/4.56kโ€‡[00:00<00:00,โ€‡378kB/s]" + } + }, + "eca99f2c5400456d92948305189d66a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f71322f009844d02830f45b40632dc6a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e167c4bf6725441d89edcd705ba032be", + "max": 125, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eca99f2c5400456d92948305189d66a6", + "value": 125 + } + }, + "f7de63cc1da94daf9dc83406301873a3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ff0bd78c11b34f92a861029aeb3c9d3a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/python/sparknlp/annotator/cv/__init__.py b/python/sparknlp/annotator/cv/__init__.py index 7c89437989600b..37eeaf696bb2a8 100644 --- a/python/sparknlp/annotator/cv/__init__.py +++ b/python/sparknlp/annotator/cv/__init__.py @@ -16,3 +16,4 @@ from sparknlp.annotator.cv.convnext_for_image_classification import * from sparknlp.annotator.cv.vision_encoder_decoder_for_image_captioning import * from sparknlp.annotator.cv.clip_for_zero_shot_classification import * +from sparknlp.annotator.cv.blip_for_question_answering import * \ No newline at end of file diff --git a/python/sparknlp/annotator/cv/blip_for_question_answering.py b/python/sparknlp/annotator/cv/blip_for_question_answering.py new file mode 100644 index 00000000000000..fe018c0e683bf2 --- /dev/null +++ b/python/sparknlp/annotator/cv/blip_for_question_answering.py @@ -0,0 +1,172 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from sparknlp.common import * + +class BLIPForQuestionAnswering(AnnotatorModel, + HasBatchedAnnotateImage, + HasImageFeatureProperties, + HasEngine, + HasCandidateLabelsProperties, + HasRescaleFactor): + """BLIPForQuestionAnswering can load BLIP models for visual question answering. + The model consists of a vision encoder, a text encoder as well as a text decoder. + The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\ + ... .setInputCols(["image_assembler"]) \\ + ... .setOutputCol("answer") + + The default model is ``"blip_vqa_base"``, if no name is + provided. + + For available pretrained models please see the `Models Hub + `__. 
+ + To see which models are compatible and how to import them see + `Import Transformers into Spark NLP 🚀 + `_. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``IMAGE`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + batchSize + Batch size. Large values allow faster processing but require more + memory, by default 2 + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. + maxSentenceLength + Max sentence length to process, by default 50 + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> image_df = spark.read.format("image").load(path=images_path) + >>> test_df = image_df.withColumn("text", lit("What's this picture about?")) + >>> imageAssembler = ImageAssembler() \\ + ... .setInputCol("image") \\ + ... .setOutputCol("image_assembler") + >>> visualQAClassifier = BLIPForQuestionAnswering.pretrained() \\ + ... .setInputCols("image_assembler") \\ + ... .setOutputCol("answer") \\ + ... .setSize(384) + >>> pipeline = Pipeline().setStages([ + ... imageAssembler, + ... visualQAClassifier + ... ]) + >>> result = pipeline.fit(test_df).transform(test_df) + >>> result.select("image_assembler.origin", "answer.result").show(truncate=False) + +--------------------------------------+------+ + |origin |result| + +--------------------------------------+------+ + |[file:///content/images/cat_image.jpg]|[cats]| + +--------------------------------------+------+ + """ + + name = "BLIPForQuestionAnswering" + + inputAnnotatorTypes = [AnnotatorType.IMAGE] + + outputAnnotatorType = AnnotatorType.DOCUMENT + + configProtoBytes = Param(Params._dummy(), + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with " + "config_proto.SerializeToString()", + TypeConverters.toListInt) + + maxSentenceLength = Param(Params._dummy(), + "maxSentenceLength", + "Maximum sentence length that the annotator will process. Above this, the sentence is skipped", + typeConverter=TypeConverters.toInt) + + def setMaxSentenceSize(self, value): + """Sets the maximum sentence length that the annotator will process, by + default 50. + + Parameters + ---------- + value : int + Maximum sentence length that the annotator will process + """ + return self._set(maxSentenceLength=value) + + + @keyword_only + def __init__(self, classname="com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering", + java_model=None): + super(BLIPForQuestionAnswering, self).__init__( + classname=classname, + java_model=java_model + ) + self._setDefault( + batchSize=2, + size=384, + maxSentenceLength=50 + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + BLIPForQuestionAnswering + The restored model + """ + from sparknlp.internal import _BLIPForQuestionAnswering + jModel = _BLIPForQuestionAnswering(folder, spark_session._jsparkSession)._java_obj + return BLIPForQuestionAnswering(java_model=jModel) + + @staticmethod + def pretrained(name="blip_vqa_base", lang="en", remote_loc=None): + """Downloads and loads a pretrained model.
+ + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default + "blip_vqa_base" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLP's repositories otherwise. + + Returns + ------- + BLIPForQuestionAnswering + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + return ResourceDownloader.downloadModel(BLIPForQuestionAnswering, name, lang, remote_loc) \ No newline at end of file diff --git a/python/sparknlp/base/image_assembler.py b/python/sparknlp/base/image_assembler.py index 3214ff37324172..cc8a9eb8c91253 100644 --- a/python/sparknlp/base/image_assembler.py +++ b/python/sparknlp/base/image_assembler.py @@ -65,6 +65,7 @@ class ImageAssembler(AnnotatorTransformer): outputAnnotatorType = AnnotatorType.IMAGE inputCol = Param(Params._dummy(), "inputCol", "input column name", typeConverter=TypeConverters.toString) + textCol = Param(Params._dummy(), "textCol", "text column name", typeConverter=TypeConverters.toString) outputCol = Param(Params._dummy(), "outputCol", "output column name", typeConverter=TypeConverters.toString) name = 'ImageAssembler' @@ -101,3 +102,13 @@ def setOutputCol(self, value): def getOutputCol(self): """Gets output column name of annotations.""" return self.getOrDefault(self.outputCol) + + def setTextCol(self, value): + """Sets an optional text column name. + + Parameters + ---------- + value : str + Name of an optional input text column + """ + return self._set(textCol=value) diff --git a/python/sparknlp/base/light_pipeline.py b/python/sparknlp/base/light_pipeline.py index 0622652fc01a42..4dd4f9128622ad 100644 --- a/python/sparknlp/base/light_pipeline.py +++ b/python/sparknlp/base/light_pipeline.py @@ -277,7 +277,7 @@ def __fullAnnotateQuestionAnswering(self, question, context): return result - def fullAnnotateImage(self, path_to_image): + def fullAnnotateImage(self, path_to_image, text=None): """Annotates the data provided into `Annotation` type results. The data should be either a list or a str. @@ -287,27 +287,38 @@ def fullAnnotateImage(self, path_to_image): path_to_image : list or str Source path of image, list of paths to images + text : list or str, optional + Optional text or list of texts. If None, defaults to an empty list when path_to_image is a list, or to an empty string when path_to_image is a string.
+ Returns ------- List[AnnotationImage] The result of the annotation """ + if not isinstance(path_to_image, (str, list)): + raise TypeError("argument for path_to_image must be 'str' or 'list[str]'") + + if text is None: + text = "" if isinstance(path_to_image, str) else [] + + if type(path_to_image) != type(text): + raise ValueError("`path_to_image` and `text` must be of the same type") + stages = self.pipeline_model.stages if not self._skipPipelineValidation(stages): self._validateStagesInputCols(stages) - if type(path_to_image) is str: + if isinstance(path_to_image, str): path_to_image = [path_to_image] + text = [text] - if type(path_to_image) is list: - result = [] + result = [] - for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image): - result.append(self.__buildStages(image_result)) + for image_result in self._lightPipeline.fullAnnotateImageJava(path_to_image, text): + result.append(self.__buildStages(image_result)) + + return result - return result - else: - raise TypeError("argument for annotation may be 'str' or list[str]") def __buildStages(self, annotations_result): stages = {} diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index 1ed209782bd18c..eec3544dc41c6f 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -1006,3 +1006,11 @@ def __init__(self, path, jspark): super(_SnowFlakeEmbeddingsLoader, self).__init__( "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark ) + +class _BLIPForQuestionAnswering(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_BLIPForQuestionAnswering, self).__init__( + "com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering.loadSavedModel", + path, + jspark, + ) \ No newline at end of file diff --git a/python/test/annotator/cv/blip_for_question_answering_test.py b/python/test/annotator/cv/blip_for_question_answering_test.py new file mode 100644 index 00000000000000..8eb0dbae3e70ae --- /dev/null +++ b/python/test/annotator/cv/blip_for_question_answering_test.py @@ -0,0 +1,80 @@ +# Copyright 2017-2024 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
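Before the test itself, here is a minimal usage sketch of how the pieces added above are expected to fit together: the ImageAssembler carries an optional question from its text column, BLIPForQuestionAnswering answers it, and LightPipeline.fullAnnotateImage now accepts the question alongside the image path. This is a sketch under stated assumptions, not part of the patch: it assumes Spark NLP 5.5.1+ with the pretrained "blip_vqa_base" model reachable, and the image directory, file name, and question are placeholders.

import sparknlp
from sparknlp.base import ImageAssembler, LightPipeline
from sparknlp.annotator import BLIPForQuestionAnswering
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit

spark = sparknlp.start()

# Placeholder image folder; any Spark-readable image directory works.
image_df = spark.read.format("image").load("src/test/resources/image/")
# The assembler reads the question from the default "text" column.
test_df = image_df.withColumn("text", lit("What's this picture about?"))

image_assembler = ImageAssembler() \
    .setInputCol("image") \
    .setOutputCol("image_assembler")

visual_qa = BLIPForQuestionAnswering.pretrained() \
    .setInputCols(["image_assembler"]) \
    .setOutputCol("answer") \
    .setSize(384)

model = Pipeline(stages=[image_assembler, visual_qa]).fit(test_df)

# The LightPipeline change above lets a question accompany each image path.
light = LightPipeline(model)
result = light.fullAnnotateImage(
    "src/test/resources/image/bluetick.jpg",  # placeholder path
    "What's this picture about?")
print(result[0]["answer"])

The test that follows exercises the same flow against the repository's test images.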
+import unittest +import pytest +import os + +from sparknlp.annotator import * +from sparknlp.base import * +from pyspark.sql.functions import lit +from test.util import SparkSessionForTest + + +class BLIPForQuestionAnsweringTestSetup(unittest.TestCase): + + def setUp(self): + self.images_path = os.getcwd() + "/../src/test/resources/image/" + image_df = SparkSessionForTest.spark.read.format("image").load( + path=self.images_path + ) + + self.test_df = image_df.withColumn("text", lit("What's this picture about?")) + + image_assembler = ImageAssembler().setInputCol("image").setOutputCol("image_assembler") + + imageClassifier = BLIPForQuestionAnswering.pretrained() \ + .setInputCols("image_assembler") \ + .setOutputCol("answer") \ + .setSize(384) + + self.pipeline = Pipeline( + stages=[ + image_assembler, + imageClassifier, + ] + ) + + self.model = self.pipeline.fit(self.test_df) + +@pytest.mark.slow +class BLIPForQuestionAnsweringTest(BLIPForQuestionAnsweringTestSetup, unittest.TestCase): + + def setUp(self): + super().setUp() + + def runTest(self): + result = self.model.transform(self.test_df).collect() + + for row in result: + self.assertTrue(row["answer"] != "") + + +@pytest.mark.slow +class LightBLIPForQuestionAnsweringTest(BLIPForQuestionAnsweringTestSetup, unittest.TestCase): + + def setUp(self): + super().setUp() + + def runTest(self): + light_pipeline = LightPipeline(self.model) + image_path = self.images_path + "bluetick.jpg" + print("image_path: " + image_path) + annotations_result = light_pipeline.fullAnnotateImage( + image_path, + "What's this picture about?" + ) + + for result in annotations_result: + self.assertTrue(len(result["image_assembler"]) > 0) + self.assertTrue(len(result["answer"]) > 0) \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala b/src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala new file mode 100644 index 00000000000000..3182d6dd0fdf92 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/ml/ai/BLIPClassifier.scala @@ -0,0 +1,215 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.johnsnowlabs.ml.ai + +import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} +import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor +import com.johnsnowlabs.nlp.annotators.cv.util.io.ImageIOUtils +import com.johnsnowlabs.nlp.annotators.cv.util.transform.ImageResizeUtils +import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.BertTokenizer +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.WordpieceEncoder +import com.johnsnowlabs.nlp.{Annotation, AnnotationImage} +import org.tensorflow.ndarray.buffer.{IntDataBuffer, LongDataBuffer} + +import scala.collection.JavaConverters._ + +private[johnsnowlabs] class BLIPClassifier( + val tensorflowWrapper: TensorflowWrapper, + configProtoBytes: Option[Array[Byte]] = None, + tokenizer: BertTokenizer, + preprocessor: Preprocessor, + signatures: Option[Map[String, String]] = None, + vocabulary: Map[String, Int]) + extends Serializable { + + private val _tfBLIPSignatures: Map[String, String] = + signatures.getOrElse(ModelSignatureManager.apply()) + + def predict( + images: Array[AnnotationImage], + questions: Seq[Annotation], + maxSentenceLength: Int, + batchSize: Int): Seq[Annotation] = { + + val sentences = SentenceSplit.unpack(questions).toArray + val tokenizedSentences = TokenizedWithSentence.unpack(questions).toArray + val inputIds = encodeTokenizedSentence( + tokenizedSentences, + sentences, + batchSize, + maxSentenceLength, + caseSensitive = false) + + val pixelValues = images + .grouped(batchSize) + .flatMap { batch => + encodeImage(batch, preprocessor) + } + .toArray + + val outputs = generate(pixelValues, inputIds, maxSentenceLength) + val decodedOutput = tokenizer.decodeTokens(outputs) + Seq(Annotation(decodedOutput)) + } + + def generate( + imagesBatch: Array[Array[Array[Array[Float]]]], + inputsBatch: Array[Array[Int]], + maxSentenceLength: Int): Array[Int] = { + val tensors = new TensorResources() + val imageTensors = tensors.createTensor(imagesBatch) + + val batchLength = inputsBatch.length + // [nb of encoded sentences , maxSentenceLength] + val shape = Array(imagesBatch.length.toLong, maxSentenceLength) + + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength) + + inputsBatch.zipWithIndex + .foreach { case (sentence, idx) => + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentence) + maskBuffers.offset(offset).write(sentence.map(x => if (x == 0) 0L else 1L)) + } + + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) + + val runner = tensorflowWrapper + .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) + .runner + + runner + .feed( + _tfBLIPSignatures + .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_ids"), + tokenTensors) + .feed( + _tfBLIPSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .feed( + _tfBLIPSignatures + .getOrElse(ModelSignatureConstants.PixelValuesInput.key, "missing_pixel_values"), + imageTensors) + .fetch(_tfBLIPSignatures + .getOrElse(ModelSignatureConstants.DecoderOutput.key, "missing_output")) + + val outs = runner.run().asScala + val output = 
TensorResources.extractInts(outs.head) + + tensors.clearSession(outs) + tensors.clearTensors() + imageTensors.close() + + output + } + + /** Calculate softmax from returned logits + * @param scores + * logits output from output layer + * @return + */ + def calculateSoftmax(scores: Array[Float]): Array[Float] = { + val exp = scores.map(x => math.exp(x)) + exp.map(x => x / exp.sum).map(_.toFloat) + } + + private def encodeImage( + annotations: Array[AnnotationImage], + preprocessor: Preprocessor): Array[Array[Array[Array[Float]]]] = { + + val batchProcessedImages = annotations.map { annot => + val bufferedImage = ImageIOUtils.byteToBufferedImage( + bytes = annot.result, + w = annot.width, + h = annot.height, + nChannels = annot.nChannels) + + val resizedImage = if (preprocessor.do_resize) { + ImageResizeUtils.resizeBufferedImage( + width = preprocessor.size, + height = preprocessor.size, + preprocessor.resample)(bufferedImage) + } else bufferedImage + + val normalizedImage = + ImageResizeUtils.normalizeAndConvertBufferedImage( + img = resizedImage, + mean = preprocessor.image_mean, + std = preprocessor.image_std, + doNormalize = preprocessor.do_normalize, + doRescale = preprocessor.do_rescale, + rescaleFactor = preprocessor.rescale_factor) + + normalizedImage + } + + batchProcessedImages + + } + + def encodeTokenizedSentence( + tokenizedSentences: Seq[TokenizedSentence], + sentences: Seq[Sentence], + batchSize: Int, + maxSentenceLength: Int, + caseSensitive: Boolean): Array[Array[Int]] = { + val wordPieceTokenizedSentences = + tokenizeWithAlignment(tokenizedSentences, maxSentenceLength, caseSensitive) + + /*Run calculation by batches*/ + wordPieceTokenizedSentences + .zip(sentences) + .zipWithIndex + .grouped(batchSize) + .flatMap { batch => + val tokensBatch = batch.map(x => (x._1._1, x._2)) + tokenizer.encode(tokensBatch, maxSentenceLength) + } + .toArray + } + + def tokenizeWithAlignment( + sentences: Seq[TokenizedSentence], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + + val encoder = new WordpieceEncoder(vocabulary) + + sentences.map { tokenIndex => + // filter empty and only whitespace tokens + val bertTokens = + tokenIndex.indexedTokens.filter(x => x.token.nonEmpty && !x.token.equals(" ")).map { + token => + val content = if (caseSensitive) token.token else token.token.toLowerCase() + val sentenceBegin = token.begin + val sentenceEnd = token.end + val sentenceIndex = tokenIndex.sentenceIndex + val result = + tokenizer.tokenize(Sentence(content, sentenceBegin, sentenceEnd, sentenceIndex)) + if (result.nonEmpty) result.head else IndexedToken("") + } + val wordpieceTokens = bertTokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + WordpieceTokenizedSentence(wordpieceTokens) + } + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala b/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala index 72ef1c6d73a123..b566c3c5ccb7ea 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/AnnotationImage.scala @@ -48,7 +48,8 @@ case class AnnotationImage( nChannels: Int, mode: Int, result: Array[Byte], - metadata: Map[String, String]) + metadata: Map[String, String], + text: String = "") extends IAnnotation { override def equals(obj: Any): Boolean = { @@ -61,7 +62,8 @@ case class AnnotationImage( this.nChannels == annotation.nChannels && this.mode == annotation.mode && this.result.sameElements(annotation.result) && - this.metadata == annotation.metadata + 
this.metadata == annotation.metadata && + this.text == annotation.text case _ => false } } @@ -94,6 +96,10 @@ case class AnnotationImage( metadata } + def getText: String = { + text + } + } object AnnotationImage { @@ -112,7 +118,8 @@ object AnnotationImage { StructField("mode", IntegerType, nullable = false), // Bytes in OpenCV-compatible order: row-wise BGR in most cases StructField("result", BinaryType, nullable = false), - StructField("metadata", MapType(StringType, StringType), nullable = true))) + StructField("metadata", MapType(StringType, StringType), nullable = true), + StructField("text", StringType, nullable = true))) val arrayType = new ArrayType(dataType, true) @@ -122,7 +129,8 @@ object AnnotationImage { width: Int, nChannels: Int, mode: Int, - result: Array[Byte]) + result: Array[Byte], + text: String) /** This method converts a [[org.apache.spark.sql.Row]] into an [[AnnotationImage]] * @@ -132,6 +140,7 @@ object AnnotationImage { * AnnotationImage */ def apply(row: Row): AnnotationImage = { + println(s"row.getString(8): ${row.getString(8)}") AnnotationImage( row.getString(0), row.getString(1), @@ -140,7 +149,8 @@ object AnnotationImage { row.getInt(4), row.getInt(5), row.getAs[Array[Byte]](6), - row.getMap[String, String](7)) + row.getMap[String, String](7), + row.getString(8)) } def apply(image: ImageFields): AnnotationImage = @@ -152,6 +162,6 @@ object AnnotationImage { nChannels = image.nChannels, mode = image.mode, result = Array.emptyByteArray, - Map.empty[String, String]) - + metadata = Map.empty[String, String], + text = image.text) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala index ded31e5e59cb51..d105c879143fbb 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/HasBatchedAnnotateImage.scala @@ -65,7 +65,8 @@ trait HasBatchedAnnotateImage[M <: Model[M]] { r.getInt(4), r.getInt(5), r.getAs(6), - r.getMap[String, String](7))) + r.getMap[String, String](7), + r.getString(8))) }) }) val outputAnnotations = batchAnnotate(inputAnnotations) diff --git a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala index 3ef7ccd67d9803..73b08bae40d695 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/ImageAssembler.scala @@ -110,7 +110,26 @@ class ImageAssembler(override val uid: String) */ def getInputCol: String = $(inputCol) - setDefault(inputCol -> IMAGE, outputCol -> "image_assembler") + /** Input text column for processing + * + * @group param + */ + val textCol: Param[String] = + new Param[String](this, "textCol", "input text column for processing") + + /** Input text column for processing + * + * @group setParam + */ + def setTextCol(value: String): this.type = set(textCol, value) + + /** Input text column for processing + * + * @group getParam + */ + def getTextCol: String = $(textCol) + + setDefault(inputCol -> IMAGE, outputCol -> "image_assembler", textCol -> "text") def this() = this(Identifiable.randomUID("ImageAssembler")) @@ -118,7 +137,8 @@ class ImageAssembler(override val uid: String) private[nlp] def assemble( image: Option[ImageFields], - metadata: Map[String, String]): Seq[AnnotationImage] = { + metadata: Map[String, String], + text: Option[String] = None): Seq[AnnotationImage] = { if (image.isDefined) { Seq( @@ -130,14 +150,21 @@ class ImageAssembler(override val uid: 
String) nChannels = image.get.nChannels, mode = image.get.mode, result = image.get.data, - metadata = metadata)) + metadata = metadata, + text = text.getOrElse(""))) } else Seq.empty } private[nlp] def dfAssemble: UserDefinedFunction = udf { (image: ImageFields) => // Apache Spark has only 1 image per row - assemble(Some(image), Map("image" -> "0")) + assemble(Some(image), Map("image" -> "0"), None) + } + + private[nlp] def dfAssembleWithText: UserDefinedFunction = udf { + (image: ImageFields, text: String) => + // Apache Spark has only 1 image per row + assemble(Some(image), Map("image" -> "0"), Some(text)) } /** requirement for pipeline transformation validation. It is called on fit() */ @@ -163,7 +190,10 @@ class ImageAssembler(override val uid: String) ImageSchemaUtils.isImage(dataset.schema(getInputCol)), s"column $getInputCol doesn't have Apache Spark ImageSchema. Make sure you read your images via spark.read.format(image).load(PATH)") - val imageAnnotations = { + val textColExists = dataset.schema.fields.exists(_.name == getTextCol) + val imageAnnotations = if (textColExists) { + dfAssembleWithText(dataset.col($(inputCol)), dataset.col($(textCol))) + } else { dfAssemble(dataset($(inputCol))) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala b/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala index 2271bd945c64b5..20236a5732f3fd 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/LightPipeline.scala @@ -44,7 +44,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = def fullAnnotate(target: String, optionalTarget: String = ""): Map[String, Seq[IAnnotation]] = { if (target.contains("/") && ResourceHelper.validFile(target)) { - fullAnnotateImage(target) + fullAnnotateImage(target, optionalTarget) } else { fullAnnotateInternal(target, optionalTarget) } @@ -60,7 +60,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } if (targets.head.contains("/") && ResourceHelper.validFile(targets.head)) { - targets.par.map(target => fullAnnotateImage(target)).toArray + fullAnnotateImages(targets, optionalTargets) } else { (targets zip optionalTargets).par.map { case (target, optionalTarget) => fullAnnotate(target, optionalTarget) @@ -68,14 +68,19 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } } - def fullAnnotateImage(pathToImages: Array[String]): Array[Map[String, Seq[IAnnotation]]] = { - pathToImages.par - .map(imageFilePath => fullAnnotateInternal(imageFilePath)) - .toArray + def fullAnnotateImages( + pathToImages: Array[String], + texts: Array[String] = Array.empty): Array[Map[String, Seq[IAnnotation]]] = { + val safeTexts = if (texts.isEmpty) Array.fill(pathToImages.length)("") else texts + (pathToImages zip safeTexts).par.map { case (imageFilePath, text) => + fullAnnotateImage(imageFilePath, text) + }.toArray } - def fullAnnotateImage(pathToImage: String): Map[String, Seq[IAnnotation]] = { - fullAnnotateInternal(pathToImage) + def fullAnnotateImage(pathToImage: String, text: String = ""): Map[String, Seq[IAnnotation]] = { + if (!ResourceHelper.validFile(pathToImage)) { + Map() + } else fullAnnotateInternal(pathToImage, text) } def fullAnnotate(audio: Array[Double]): Map[String, Seq[IAnnotation]] = { @@ -108,7 +113,7 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = optionalTarget, annotations) case imageAssembler: ImageAssembler => - processImageAssembler(target, 
imageAssembler, annotations) + processImageAssembler(target, optionalTarget, imageAssembler, annotations) case audioAssembler: AudioAssembler => processAudioAssembler(audio, audioAssembler, annotations) case lazyAnnotator: AnnotatorModel[_] if lazyAnnotator.getLazyAnnotator => annotations @@ -157,12 +162,13 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = private def processImageAssembler( target: String, + text: String, imageAssembler: ImageAssembler, annotations: Map[String, Seq[IAnnotation]]): Map[String, Seq[IAnnotation]] = { val currentImageFields = ImageIOUtils.imagePathToImageFields(target) annotations.updated( imageAssembler.getOutputCol, - imageAssembler.assemble(currentImageFields, Map.empty[String, String])) + imageAssembler.assemble(currentImageFields, Map.empty[String, String], Some(text))) } private def processAudioAssembler( @@ -209,9 +215,9 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = getCombinedAnnotations(batchedAnnotatorImage.getInputCols, annotations) val batchedAnnotations = Seq(combinedAnnotations.map(_.asInstanceOf[AnnotationImage])) - annotations.updated( - batchedAnnotatorImage.getOutputCol, - batchedAnnotatorImage.batchAnnotate(batchedAnnotations).head) + val outputCol = batchedAnnotatorImage.getOutputCol + val annotateResult = batchedAnnotatorImage.batchAnnotate(batchedAnnotations) + annotations.updated(outputCol, annotateResult.head) } private def processBatchedAnnotatorAudio( @@ -361,15 +367,34 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = fullAnnotateImage(pathToImage).mapValues(_.asJava).asJava } - def fullAnnotateImageJava(pathToImages: java.util.ArrayList[String]) + import scala.collection.JavaConverters._ + + def fullAnnotateImageJava( + pathToImages: java.util.ArrayList[String], + texts: java.util.ArrayList[String]) : java.util.List[java.util.Map[String, java.util.List[IAnnotation]]] = { + if (texts.isEmpty) { + pathToImages.asScala.par + .map { imageFilePath => + fullAnnotateInternal(imageFilePath).mapValues(_.asJava).asJava + } + .toList + .asJava + } else { - pathToImages.asScala.par - .map { imageFilePath => - fullAnnotateInternal(imageFilePath).mapValues(_.asJava).asJava + if (pathToImages.size != texts.size) { + throw new IllegalArgumentException( + "pathToImages and texts must have the same number of elements.") } - .toList - .asJava + val imageTextPairs = pathToImages.asScala.zip(texts.asScala).par + + imageTextPairs + .map { case (imageFilePath, text) => + fullAnnotateImage(imageFilePath, text).mapValues(_.asJava).asJava + } + .toList + .asJava + } } def fullAnnotateSingleAudioJava( @@ -394,14 +419,16 @@ class LightPipeline(val pipelineModel: PipelineModel, parseEmbeddings: Boolean = } def annotate(target: String, optionalTarget: String = ""): Map[String, Seq[String]] = { - fullAnnotate(target, optionalTarget).mapValues(_.map { iAnnotation => - val annotation = iAnnotation.asInstanceOf[Annotation] - annotation.annotatorType match { - case AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS - if parseEmbeddings => - annotation.embeddings.mkString(" ") - case _ => annotation.result - } + val annotations = fullAnnotate(target, optionalTarget) + annotations.mapValues(_.map { + case annotation: Annotation => + annotation.annotatorType match { + case AnnotatorType.WORD_EMBEDDINGS | AnnotatorType.SENTENCE_EMBEDDINGS + if parseEmbeddings => + annotation.embeddings.mkString(" ") + case _ => annotation.result + } + case _ => "" }) } 
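The LightPipeline changes above thread an optional question string from fullAnnotate, fullAnnotateImage and fullAnnotateImages down to ImageAssembler.assemble, which is what enables visual question answering outside of a DataFrame. The following is a minimal usage sketch built only from APIs introduced or exercised in this patch; the image folder and file paths mirror the repository's src/test/resources/image test resources and are illustrative, and it assumes the pretrained "blip_vqa_base" model can be downloaded. Treat it as a sketch of the intended call pattern, not as part of the diff itself.

import com.johnsnowlabs.nlp.ImageAssembler
import com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering
import com.johnsnowlabs.nlp.base.LightPipeline
import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.spark.ml.Pipeline

// Any local folder with a few images; this path mirrors the test resources used in this patch
val imageFolder = "src/test/resources/image/"
val imageDF = ResourceHelper.spark.read
  .format("image")
  .option("dropInvalid", value = true)
  .load(imageFolder)

val imageAssembler = new ImageAssembler()
  .setInputCol("image")
  .setOutputCol("image_assembler")

val visualQA = BLIPForQuestionAnswering
  .pretrained() // defaults to "blip_vqa_base"
  .setInputCols("image_assembler")
  .setOutputCol("answer")
  .setSize(384)

// The pipeline contains only transformers, so fit() just assembles the PipelineModel
val pipelineModel = new Pipeline()
  .setStages(Array(imageAssembler, visualQA))
  .fit(imageDF)

val lightPipeline = new LightPipeline(pipelineModel)

// Single image plus question: the second argument is the new optional text target
val fullResult = lightPipeline.fullAnnotateImage(
  "src/test/resources/image/egyptian_cat.jpeg",
  "What's this picture about?")

// Batch form introduced in this patch: one question per image path, same array length
val batchedResults = lightPipeline.fullAnnotateImages(
  Array("src/test/resources/image/bluetick.jpg", "src/test/resources/image/egyptian_cat.jpeg"),
  Array("What's this picture about?", "What's this picture about?"))

The annotate(target, optionalTarget) overload shown in the tests later in this patch follows the same pattern and returns plain strings instead of Annotation objects.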
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala new file mode 100644 index 00000000000000..a0f15de929cafb --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnswering.scala @@ -0,0 +1,384 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.cv + +import com.johnsnowlabs.ml.ai.BLIPClassifier +import com.johnsnowlabs.ml.tensorflow.{ + ReadTensorflowModel, + TensorflowWrapper, + WriteTensorflowModel +} +import com.johnsnowlabs.ml.util.LoadExternalModel.{ + loadJsonStringAsset, + loadTextAsset, + modelSanityCheck, + notSupportedEngineError +} +import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.nlp.AnnotatorType.{DOCUMENT, IMAGE} +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.annotators.RegexTokenizer +import com.johnsnowlabs.nlp.annotators.cv.feature_extractor.Preprocessor +import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector +import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BertTokenizer, SpecialTokens} +import com.johnsnowlabs.nlp.serialization.MapFeature +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param.{IntArrayParam, IntParam} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** BLIPForQuestionAnswering can load BLIP models for visual question answering. The model + * consists of a vision encoder, a text encoder as well as a text decoder. The vision encoder + * will encode the input image, the text encoder will encode the input question together with the + * encoding of the image, and the text decoder will output the answer to the question. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val visualQAClassifier = BLIPForQuestionAnswering.pretrained() + * .setInputCols("image_assembler") + * .setOutputCol("answer") + * }}} + * The default model is `"blip_vqa_base"`, if no name is provided. + * + * For available pretrained models please see the + * [[https://sparknlp.org/models?task=Question+Answering Models Hub]]. + * + * Models from the HuggingFace ๐Ÿค— Transformers library are also compatible with Spark NLP ๐Ÿš€. To + * see which models are compatible and how to import them see + * [[https://github.com/JohnSnowLabs/spark-nlp/discussions/5669]] and to see more extended + * examples, see + * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala]]. 
+ * + * ==Example== + * {{{ + * import spark.implicits._ + * import com.johnsnowlabs.nlp.base._ + * import com.johnsnowlabs.nlp.annotator._ + * import org.apache.spark.ml.Pipeline + * + * val imageDF: DataFrame = ResourceHelper.spark.read + * .format("image") + * .option("dropInvalid", value = true) + * .load(imageFolder) + * + * val testDF: DataFrame = imageDF.withColumn("text", lit("What's this picture about?")) + * + * val imageAssembler: ImageAssembler = new ImageAssembler() + * .setInputCol("image") + * .setOutputCol("image_assembler") + * + * val visualQAClassifier = BLIPForQuestionAnswering.pretrained() + * .setInputCols("image_assembler") + * .setOutputCol("answer") + * + * val pipeline = new Pipeline().setStages(Array( + * imageAssembler, + * visualQAClassifier + * )) + * + * val result = pipeline.fit(testDF).transform(testDF) + * + * result.select("image_assembler.origin", "answer.result").show(false) + * +--------------------------------------+------+ + * |origin |result| + * +--------------------------------------+------+ + * |[file:///content/images/cat_image.jpg]|[cats]| + * +--------------------------------------+------+ + * }}} + * + * @see + * [[CLIPForZeroShotClassification]] for Zero Shot Image Classifier + * @see + * [[https://sparknlp.org/docs/en/annotators Annotators Main Page]] for a list of transformer + * based classifiers + * @param uid + * required uid for storing annotator to disk + * @groupname anno Annotator types + * @groupdesc anno + * Required input and expected output annotator types + * @groupname Ungrouped Members + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupname Ungrouped Members + * @groupprio param 1 + * @groupprio anno 2 + * @groupprio Ungrouped 3 + * @groupprio setParam 4 + * @groupprio getParam 5 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ + +class BLIPForQuestionAnswering(override val uid: String) + extends AnnotatorModel[BLIPForQuestionAnswering] + with HasBatchedAnnotateImage[BLIPForQuestionAnswering] + with HasImageFeatureProperties + with WriteTensorflowModel + with HasEngine { + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + def this() = this(Identifiable.randomUID("BLIPForQuestionAnswering")) + + /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator + * type + */ + override val inputAnnotatorTypes: Array[AnnotatorType] = Array(IMAGE) + override val outputAnnotatorType: AnnotatorType = DOCUMENT + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * config_proto.SerializeToString() + * + * @group param + */ + val configProtoBytes = new IntArrayParam( + this, + "configProtoBytes", + "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()") + + /** ConfigProto from tensorflow, serialized into byte array. Get with + * config_proto.SerializeToString() + * + * @group setParam + */ + def setConfigProtoBytes(bytes: Array[Int]): BLIPForQuestionAnswering.this.type = + set(this.configProtoBytes, bytes) + + /** ConfigProto from tensorflow, serialized into byte array. 
Get with + * config_proto.SerializeToString() + * + * @group getParam + */ + def getConfigProtoBytes: Option[Array[Byte]] = + get(this.configProtoBytes).map(_.map(_.toByte)) + + /** It contains TF model signatures for the laded saved model + * + * @group param + */ + val signatures = + new MapFeature[String, String](model = this, name = "signatures").setProtected() + + /** @group setParam */ + def setSignatures(value: Map[String, String]): this.type = { + set(signatures, value) + this + } + + /** @group getParam */ + def getSignatures: Option[Map[String, String]] = get(this.signatures) + + /** Vocabulary used to encode the words to ids with WordPieceEncoder + * + * @group param + */ + val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected() + + /** @group setParam */ + def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value) + + /** @group getParam */ + protected[nlp] def getVocabulary: Map[String, Int] = $$(vocabulary) + + /** Max sentence length to process (Default: `512`) + * + * @group param + */ + val maxSentenceLength = + new IntParam(this, "maxSentenceLength", "Max sentence length to process") + + /** @group setParam */ + def setMaxSentenceLength(value: Int): this.type = { + set(maxSentenceLength, value) + this + } + + /** @group getParam */ + def getMaxSentenceLength: Int = $(maxSentenceLength) + + private var _model: Option[Broadcast[BLIPClassifier]] = None + + /** @group setParam */ + def setModelIfNotSet( + spark: SparkSession, + preprocessor: Preprocessor, + tensorflow: TensorflowWrapper): this.type = { + if (_model.isEmpty) { + + val specialTokens = SpecialTokens.getSpecialTokensForModel("bert", getVocabulary) + val bertTokenizer = new BertTokenizer(getVocabulary, specialTokens) + + _model = Some( + spark.sparkContext.broadcast( + new BLIPClassifier( + tensorflow, + configProtoBytes = getConfigProtoBytes, + tokenizer = bertTokenizer, + preprocessor = preprocessor, + signatures = getSignatures, + vocabulary = $$(vocabulary)))) + } + this + } + + /** @group getParam */ + def getModelIfNotSet: BLIPClassifier = _model.get.value + + setDefault(batchSize -> 8, size -> 384, maxSentenceLength -> 50) + + /** takes a document and annotations and produces new annotations of this annotator's annotation + * type + * + * @param batchedAnnotations + * Annotations in batches that correspond to inputAnnotationCols generated by previous + * annotators if any + * @return + * any number of annotations processed for every batch of input annotations. 
Not necessary + * one to one relationship + */ + override def batchAnnotate( + batchedAnnotations: Seq[Array[AnnotationImage]]): Seq[Seq[Annotation]] = { + + batchedAnnotations + .filter { annotationImages => + annotationImages.exists(_.text.nonEmpty) + } + .map { cleanAnnotationImages => + val validImages = cleanAnnotationImages.filter(_.result.nonEmpty) + val questionAnnotations = extractInputAnnotation(validImages) + + getModelIfNotSet.predict( + validImages, + questionAnnotations, + $(batchSize), + $(maxSentenceLength)) + } + } + + private def extractInputAnnotation( + annotationImages: Array[AnnotationImage]): Seq[Annotation] = { + val questions = annotationImages.map(annotationImage => Annotation(annotationImage.text)) + val sentenceAnnotations = + new SentenceDetector().setInputCols("document").setOutputCol("sentence") + val sentencesQuestions = sentenceAnnotations.annotate(questions) + + val tokenizerAnnotation = new RegexTokenizer().setInputCols("sentence").setOutputCol("token") + val tokenQuestions = tokenizerAnnotation.annotate(sentencesQuestions) + + sentencesQuestions ++ tokenQuestions + } + + override def onWrite(path: String, spark: SparkSession): Unit = { + super.onWrite(path, spark) + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper, + "_image_qa", + BLIPForQuestionAnswering.tfFile, + configProtoBytes = getConfigProtoBytes) + } + +} + +trait ReadablePretrainedBLIPForQuestionAnswering + extends ParamsAndFeaturesReadable[BLIPForQuestionAnswering] + with HasPretrained[BLIPForQuestionAnswering] { + + override val defaultModelName: Some[String] = Some("blip_vqa_base") + + /** Java compliant-overrides */ + override def pretrained(): BLIPForQuestionAnswering = super.pretrained() + + override def pretrained(name: String): BLIPForQuestionAnswering = + super.pretrained(name) + + override def pretrained(name: String, lang: String): BLIPForQuestionAnswering = + super.pretrained(name, lang) + + override def pretrained( + name: String, + lang: String, + remoteLoc: String): BLIPForQuestionAnswering = + super.pretrained(name, lang, remoteLoc) + +} + +trait ReadBLIPForQuestionAnsweringDLModel extends ReadTensorflowModel { + this: ParamsAndFeaturesReadable[BLIPForQuestionAnswering] => + override val tfFile: String = "blip_vqa_tensorflow" + + def readModel(instance: BLIPForQuestionAnswering, path: String, spark: SparkSession): Unit = { + val tf = readTensorflowModel(path, spark, "_blip_vqa_tf", initAllTables = false) + + val preprocessor = Preprocessor( + do_normalize = true, + do_resize = true, + "BLIPFeatureExtractor", + instance.getImageMean, + instance.getImageStd, + instance.getResample, + instance.getSize) + + instance.setModelIfNotSet(spark, preprocessor, tf) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): BLIPForQuestionAnswering = { + val (localModelPath, detectedEngine) = modelSanityCheck(modelPath) + val preprocessorConfigJsonContent = + loadJsonStringAsset(localModelPath, "preprocessor_config.json") + val preprocessorConfig = Preprocessor.loadPreprocessorConfig(preprocessorConfigJsonContent) + val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap + + val annotatorModel = new BLIPForQuestionAnswering() + annotatorModel.set(annotatorModel.engine, detectedEngine) + + detectedEngine match { + case TensorFlow.name => + val (wrapper, signatures) = + TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) + + val _signatures = signatures match { + case Some(s) => s + case None 
=> throw new Exception("Cannot load signature definitions from model!") + } + + /** the order of setSignatures is important if we use getSignatures inside + * setModelIfNotSet + */ + annotatorModel + .setVocabulary(vocabs) + .setSignatures(_signatures) + .setModelIfNotSet(spark, preprocessorConfig, wrapper) + .setSize(384) + + case _ => + throw new Exception(notSupportedEngineError) + } + + annotatorModel + } +} + +object BLIPForQuestionAnswering + extends ReadablePretrainedBLIPForQuestionAnswering + with ReadBLIPForQuestionAnsweringDLModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala new file mode 100644 index 00000000000000..d3650367bbe1cf --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BertTokenizer.scala @@ -0,0 +1,81 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.tokenizer.bpe + +import com.johnsnowlabs.nlp.annotators.common.WordpieceTokenizedSentence +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer + +import java.nio.charset.Charset +import scala.collection.mutable.ListBuffer + +class BertTokenizer(val vocab: Map[String, Int], val specialTokens: SpecialTokens) + extends BasicTokenizer { + + /** Encode the input sequence to indexes IDs adding padding where necessary */ + def encode( + sentences: Seq[(WordpieceTokenizedSentence, Int)], + maxSequenceLength: Int): Seq[Array[Int]] = { + val maxSentenceLength = + Array( + maxSequenceLength - 2, + sentences.map { case (wpTokSentence, _) => + wpTokSentence.tokens.length + }.max).min + + sentences + .map { case (wpTokSentence, _) => + val tokenPieceIds = wpTokSentence.tokens.map(t => t.pieceId) + val padding = Array.fill(maxSentenceLength - tokenPieceIds.length)(specialTokens.pad.id) + + Array(specialTokens.sentenceStart.id) ++ tokenPieceIds.take(maxSentenceLength) ++ Array( + specialTokens.sentenceEnd.id) ++ padding + } + } + + def decodeTokens(tokens: Array[Int]): String = { + val specialTokens = SpecialTokens.getSpecialTokensForModel("bert", vocab) + val decoderVocab: Map[Int, String] = vocab.map(x => (x._2, x._1)) + val unicodeToByteMapping: Map[String, Int] = + bytesToUnicodeMapping.map(x => (x._2, x._1)) + val text = tokens + .map(token => decoderVocab.getOrElse(token, "")) + .filter(x => !specialTokens.contains(x)) + .mkString("") + val bytes = text.map(x => unicodeToByteMapping(x.toString)).map(x => x.toByte).toArray + new String(bytes, Charset.forName("UTF-8")) + } + + /** Mapping for bytes to a different set of unicode characters (especially white spaces). 
This + * improved model performance for gpt-2 + */ + protected val bytesToUnicodeMapping: Map[Int, String] = { + val bytes: ListBuffer[Int] = + ListBuffer.range('!', '~' + 1) ++ ListBuffer.range('ยก', 'ยฌ' + 1) ++ ListBuffer + .range('ยฎ', 'รฟ' + 1) + val characters: ListBuffer[Int] = bytes.clone + var n = 0 + for (b <- 0 to 256) { + if (!bytes.contains(b)) { + bytes += b + characters += (256 + n) + n += 1 + } + } + (bytes zip characters.map(_.toChar.toString)).toMap + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala index eb2769a4ad7458..4afb1d5b9bf18c 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala @@ -170,6 +170,14 @@ private[johnsnowlabs] object SpecialTokens { unkTokenString = "<|endoftext|>", maskTokenString = "<|endoftext|>", padTokenString = "<|endoftext|>") + case "bert" => + SpecialTokens( + vocab, + startTokenString = "[CLS]", + endTokenString = "[SEP]", + unkTokenString = "[UNK]", + maskTokenString = "[MASK]", + padTokenString = "[PAD]") } } diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala index 59747ec2c14f21..53ab187d6eca16 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/PretrainedPipeline.scala @@ -119,7 +119,7 @@ case class PretrainedPipeline( } def fullAnnotateImage(pathToImages: Array[String]): Array[Map[String, Seq[IAnnotation]]] = { - lightModel.fullAnnotateImage(pathToImages) + lightModel.fullAnnotateImages(pathToImages) } def fullAnnotate(audio: Array[Float]): Map[String, Seq[IAnnotation]] = { @@ -157,9 +157,14 @@ case class PretrainedPipeline( lightModel.fullAnnotateImageJava(pathToImage) } - def fullAnnotateImageJava(pathToImages: java.util.ArrayList[String]) + def fullAnnotateImageJava( + pathToImages: java.util.ArrayList[String], + texts: java.util.ArrayList[String]) : java.util.List[java.util.Map[String, java.util.List[IAnnotation]]] = { - lightModel.fullAnnotateJava(pathToImages) + if (texts.isEmpty) { + lightModel.fullAnnotateJava(pathToImages) + } else lightModel.fullAnnotateImageJava(pathToImages, texts) + } def fullAnnotateSingleAudioJava( diff --git a/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala b/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala index d1991a8c5db95a..423cb03f8929ed 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/AssertAnnotations.scala @@ -105,9 +105,10 @@ object AssertAnnotations { val mode = columnName + ".mode" val result = columnName + ".result" val metadata = columnName + ".metadata" + val text = columnName + ".text" dataSet - .select(annotatorType, origin, height, width, nChannels, mode, result, metadata) + .select(annotatorType, origin, height, width, nChannels, mode, result, metadata, text) .rdd .map { row => val annotatorTypeSeq: Seq[String] = row @@ -134,6 +135,9 @@ object AssertAnnotations { val metadataSeq: Seq[Map[String, String]] = row .getAs[Map[String, String]]("metadata") .asInstanceOf[mutable.WrappedArray[Map[String, String]]] + val textSeq: Seq[String] = row + .getAs[String]("text") + .asInstanceOf[mutable.WrappedArray[String]] originSeq.zipWithIndex.map { 
case (origin, index) => AnnotationImage( @@ -144,7 +148,8 @@ object AssertAnnotations { nChannelsSeq(index), modeSeq(index), resultSeq(index).asInstanceOf[Array[Byte]], - metadataSeq(index)) + metadataSeq(index), + textSeq(index)) } } .collect() diff --git a/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala b/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala index d9baaf6fa38a82..0161fbdff4e35c 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/ImageAssemblerTest.scala @@ -21,6 +21,7 @@ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.{FastTest, SlowTest} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.lit import org.scalatest.flatspec.AnyFlatSpec class ImageAssemblerTest extends AnyFlatSpec { @@ -42,9 +43,32 @@ class ImageAssemblerTest extends AnyFlatSpec { val assembled = imageAssembler.transform(dataFrame) val result = AssertAnnotations.getActualImageResult(assembled, "image_assembler") - assert(result.nonEmpty) + result.foreach(annotationImages => + annotationImages.foreach { annotationImage => + assert(annotationImage.annotatorType == IMAGE) + assert(annotationImage.origin.contains(imagesPath)) + assert(annotationImage.height >= 0) + assert(annotationImage.width >= 0) + assert(annotationImage.nChannels >= 0) + assert(annotationImage.mode >= 0) + assert(annotationImage.result.nonEmpty) + assert(annotationImage.metadata.nonEmpty) + assert(annotationImage.text.isEmpty) + }) + } + + it should "work with text column" taggedAs FastTest in { + + val testDF: DataFrame = dataFrame.withColumn("text", lit("What's this picture about?")) + val imageAssembler: ImageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + + val assembled = imageAssembler.transform(testDF) + val result = AssertAnnotations.getActualImageResult(assembled, "image_assembler") + assert(result.nonEmpty) result.foreach(annotationImages => annotationImages.foreach { annotationImage => assert(annotationImage.annotatorType == IMAGE) @@ -55,6 +79,7 @@ class ImageAssemblerTest extends AnyFlatSpec { assert(annotationImage.mode >= 0) assert(annotationImage.result.nonEmpty) assert(annotationImage.metadata.nonEmpty) + assert(annotationImage.text.nonEmpty) }) } @@ -82,7 +107,7 @@ class ImageAssemblerTest extends AnyFlatSpec { val pipeline: Pipeline = new Pipeline().setStages(Array(imageAssembler)) val pipelineModel = pipeline.fit(emptyDF) val lightPipeline = new LightPipeline(pipelineModel) - val result = lightPipeline.fullAnnotateImage(images) + val result = lightPipeline.fullAnnotateImages(images) assert(result.length == images.length) result.foreach(annotation => assert(annotation("image_assembler").nonEmpty)) diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala new file mode 100644 index 00000000000000..d511151316ce96 --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/BLIPForQuestionAnsweringTest.scala @@ -0,0 +1,174 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.johnsnowlabs.nlp.annotators.cv + +import com.johnsnowlabs.nlp.base.LightPipeline +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.nlp.{Annotation, AssertAnnotations, ImageAssembler} +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.lit +import org.scalatest.flatspec.AnyFlatSpec + +class BLIPForQuestionAnsweringTest extends AnyFlatSpec { + + lazy val model = getBLIPForQuestionAnsweringPipelineModel + + "BLIP" should "answer a question for a given image" taggedAs SlowTest in { + + val testDF = getTestDF + val result = model.transform(testDF) + + val answerAnnotation = AssertAnnotations.getActualResult(result, "answer") + + answerAnnotation.foreach { annotation => + annotation.foreach(a => assert(a.result.nonEmpty)) + } + } + + it should "work with light pipeline annotate" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagePath = "src/test/resources/image/egyptian_cat.jpeg" + val resultAnnotate = lightPipeline.annotate(imagePath, "What's this picture about?") + println(s"resultAnnotate: $resultAnnotate") + + assert(resultAnnotate("answer").head.contains("cat")) + } + + it should "work with light pipeline full annotate" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagePath = "src/test/resources/image/bluetick.jpg" + val resultFullAnnotate = + lightPipeline.fullAnnotateImage(imagePath, "What's this picture about?") + + val answerAnnotation = resultFullAnnotate("answer").head.asInstanceOf[Annotation] + + println(s"imageName.result: ${answerAnnotation.result}") + assert(answerAnnotation.result.nonEmpty) + } + + it should "fullAnnotate with empty Map when a text is empty" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagesPath = Array( + "src/test/resources/image/bluetick.jpg", + "src/test/resources/image/chihuahua.jpg", + "src/test/resources/image/egyptian_cat.jpeg") + val question = "What's this picture about?" 
+ val questions = Array(question, "", question) + + val resultFullAnnotate = lightPipeline.fullAnnotateImages(imagesPath, questions) + + resultFullAnnotate.zip(imagesPath).foreach { case (annotateMap, imagePath) => + imagePath match { + case "src/test/resources/image/chihuahua.jpg" => + // For the chihuahua image, the annotateMap should be empty because the question is empty + assert( + annotateMap.isEmpty, + s"Expected empty map for image: $imagePath, but got: $annotateMap") + + case _ => + assert(annotateMap.nonEmpty, s"Expected non-empty map for image: $imagePath") + + annotateMap.get("answer") match { + case Some(annotations) => + annotations.foreach { iAnnotation => + val annotation = iAnnotation.asInstanceOf[Annotation] + assert( + annotation.result.nonEmpty, + s"Expected non-empty result for image: $imagePath, but got empty result") + } + case None => + fail(s"'answer' key not found in annotateMap for image: $imagePath") + } + } + } + } + + it should "annotate with empty Map when a text is empty" taggedAs SlowTest in { + val lightPipeline = new LightPipeline(model) + val imagesPath = Array( + "src/test/resources/image/bluetick.jpg", + "src/test/resources/image/chihuahua.jpg", + "src/test/resources/image/egyptian_cat.jpeg") + val question = "What's this picture about?" + val questions = Array(question, "", question) + + val resultAnnotate = lightPipeline.annotate(imagesPath, questions) + + resultAnnotate.foreach { annotate => + println(s"annotate: $annotate") + } + + resultAnnotate.zip(imagesPath).foreach { case (annotateMap, imagePath) => + imagePath match { + case "src/test/resources/image/chihuahua.jpg" => + // For the chihuahua image, the annotateMap should be empty because the question is empty + assert( + annotateMap.isEmpty, + s"Expected empty map for image: $imagePath, but got: $annotateMap") + + case _ => + assert(annotateMap.nonEmpty, s"Expected non-empty map for image: $imagePath") + + annotateMap.get("answer") match { + case Some(annotations) => + annotations.foreach { annotation => + assert( + annotation.nonEmpty, + s"Expected non-empty result for image: $imagePath, but got empty result") + } + case None => + fail(s"'answer' key not found in annotateMap for image: $imagePath") + } + } + } + + } + + private def getBLIPForQuestionAnsweringPipelineModel = { + val testDF = getTestDF + + val imageAssembler: ImageAssembler = new ImageAssembler() + .setInputCol("image") + .setOutputCol("image_assembler") + + val loadModel = BLIPForQuestionAnswering + .pretrained() + .setInputCols("image_assembler") + .setOutputCol("answer") + .setSize(384) + + val newPipeline: Pipeline = + new Pipeline().setStages(Array(imageAssembler, loadModel)) + + newPipeline.fit(testDF) + } + + private def getTestDF: DataFrame = { + val imageFolder = "src/test/resources/image/" + val imageDF: DataFrame = ResourceHelper.spark.read + .format("image") + .option("dropInvalid", value = true) + .load(imageFolder) + + val testDF: DataFrame = imageDF.withColumn("text", lit("What's this picture about?")) + + testDF + } + +} diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala index 85b43a790634ab..92491fc1abddac 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/CLIPForZeroShotClassificationTestSpec.scala @@ -74,7 +74,7 @@ class 
CLIPForZeroShotClassificationTestSpec extends AnyFlatSpec { val pipelineModel = pipeline.fit(imageDF) val lightPipeline = new LightPipeline(pipelineModel) val images = expected.keys.map(imageFolder + _).toArray - val result = lightPipeline.fullAnnotateImage(images) + val result = lightPipeline.fullAnnotateImages(images) result.foreach { row: Map[String, Seq[IAnnotation]] => val imageName = diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala index fdf2e43b574a81..0eacd5378bde6f 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/ViTImageClassificationTestSpec.scala @@ -159,7 +159,7 @@ trait ViTForImageClassificationBehaviors { this: AnyFlatSpec => val images = Array("src/test/resources/image/hen.JPEG", "src/test/resources/image/missing_file.mf") - val predictions = lightPipeline.fullAnnotateImage(images) + val predictions = lightPipeline.fullAnnotateImages(images) assert(predictions(0)("image_assembler").nonEmpty) assert(predictions(0)("class").nonEmpty) @@ -185,7 +185,7 @@ trait ViTForImageClassificationBehaviors { this: AnyFlatSpec => val images = Array("src/test/resources/image/hen.JPEG", "this is a text") - val predictions = lightPipeline.fullAnnotateImage(images) + val predictions = lightPipeline.fullAnnotateImages(images) assert(predictions(0)("image_assembler").nonEmpty) assert(predictions(0)("class").nonEmpty) @@ -232,7 +232,7 @@ class ViTImageClassificationTestSpec extends AnyFlatSpec with ViTForImageClassif "tractor.JPEG" -> "tractor", "ox.JPEG" -> "ox") - private lazy val model: ViTForImageClassification = ViTForImageClassification.pretrained() + private val model: ViTForImageClassification = ViTForImageClassification.pretrained() it should behave like behaviorsViTForImageClassification[ViTForImageClassification]( diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala index 64aae2c9d330b9..b67e2684ea432a 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/cv/VisionEncoderDecoderForImageCaptioningTestSpec.scala @@ -88,7 +88,7 @@ class VisionEncoderDecoderForImageCaptioningTestSpec extends AnyFlatSpec { val pipelineModel = pipeline.fit(imageDF) val lightPipeline = new LightPipeline(pipelineModel) val image = imageFolder + "egyptian_cat.jpeg" - val results = lightPipeline.fullAnnotateImage(Array(image, image)) + val results = lightPipeline.fullAnnotateImages(Array(image, image)) results.foreach { result => assert(result("image_assembler").nonEmpty)
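The remaining test hunks migrate existing callers from the removed array overload fullAnnotateImage(Array(...)) to the renamed fullAnnotateImages. A minimal migration sketch follows, reusing the pipelineModel and imports from the sketch earlier in this section; the image path is a placeholder taken from the test resources.

val image = "src/test/resources/image/egyptian_cat.jpeg" // placeholder path from the test resources
val lightPipeline = new LightPipeline(pipelineModel)      // pipelineModel: any fitted image pipeline, assumed here

// Before this patch: lightPipeline.fullAnnotateImage(Array(image, image))
// After this patch:  batch calls go through fullAnnotateImages; the questions array is optional
//                    and defaults to empty strings when omitted
val results = lightPipeline.fullAnnotateImages(Array(image, image))

As the BLIPForQuestionAnswering tests above assert, when the pipeline ends in BLIPForQuestionAnswering an image submitted with an empty question comes back as an empty map, so callers should pass one non-empty question per image when they expect an answer.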