From 7bbabbc4e80b1d8753b8cf4822a42745e2186194 Mon Sep 17 00:00:00 2001
From: ahmedlone127
Date: Mon, 2 Sep 2024 07:44:16 +0500
Subject: [PATCH 1/5] adding OpenVino support to CamembertForXXX

---
 ...rk_NLP_CamemBertForQuestionAnswering.ipynb | 2318 ++++++++++++++++
 ...P_CamemBertForSequenceClassification.ipynb |  551 ++++
 ..._NLP_CamemBertForTokenClassification.ipynb | 2429 +++++++++++++++++
 .../ml/ai/CamemBertClassification.scala       |  280 +-
 4 files changed, 5412 insertions(+), 166 deletions(-)
 create mode 100644 examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb
 create mode 100644 examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb
 create mode 100644 examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb

diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb
new file mode 100644
index 00000000000000..5bfd0095e9d106
--- /dev/null
+++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb
@@ -0,0 +1,2318 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "_V5XcDCnVgSi"
+   },
+   "source": [
+    "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForQuestionAnswering.ipynb)\n",
+    "\n",
+    "# Import OpenVINO CamemBertForQuestionAnswering models from HuggingFace πŸ€— into Spark NLP πŸš€\n",
+    "\n",
+    "This notebook provides a detailed walkthrough on optimizing and exporting CamemBertForQuestionAnswering models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n",
+    "\n",
+    "Let's keep in mind a few things before we start 😊\n",
+    "\n",
+    "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high-performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n",
+    "- You can import models for CamemBertForQuestionAnswering from HuggingFace, and they have to be in the `Question Answering` category."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aghasVppVgSk"
+   },
+   "source": [
+    "## 1. Export and Save the HuggingFace model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "be4HsTDMVgSk"
+   },
+   "source": [
+    "- Let's install the `transformers` and `openvino` packages along with their other dependencies. Spark NLP itself doesn't need `openvino` to be installed; we only need it here to load and export models from HuggingFace.\n",
+    "- We lock `transformers` to version `4.41.2`. This doesn't mean it won't work with future releases, but we want you to know which versions have been tested successfully."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "3ca3be73-42ac-480e-dfea-8f870fba8843" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m572.4 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m19.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m527.3/527.3 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m421.5/421.5 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.9/39.9 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m81.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m39.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.63.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.25.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.23.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "googleapis-common-protos 1.64.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip 
install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- To load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. We can use this to import and export OpenVINO models with `from_pretrained` and `save_pretrained`.\n", + "- By setting `export=True`, the source model is converted to OpenVINO IR format on the fly.\n", + "- We'll use [etalab-ia/camembert-base-squadFR-fquad-piaf](https://huggingface.co/etalab-ia/camembert-base-squadFR-fquad-piaf) model from HuggingFace as an example and load it as a `OVModelForQuestionAnswering`, representing an OpenVINO model.\n", + "- In addition to the OVModelForQuestionAnswering model, we also need to save the `AutoTokenizer`. This is the same for every model, these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 654, + "referenced_widgets": [ + "b1081789b9c1420799c9d6a913a59383", + "4dc79d1a123c49eca4a261cdf4e0bd57", + "eab4e4bad57d4fd2ba8f6d1831758f7d", + "8bfc817cf7ab45bfb64dd3286b7430d6", + "c0b72c2e75104483af2c0a187a4bc084", + "1c0efac1ef03403cb0334326b555ba54", + "0798f788965f43c0b897600266d31600", + "b3736e95f3c644b097413ed818bf6a46", + "a036b2fbaa6b4af7b4767aa37bfa0aa8", + "47057927a7d74304a1c9ded7059d5c3e", + "e4bbb56ba1664157a440a1c320781555", + "33cea0738d6b42a285be912435f64663", + "ac667dc780a44cc3b10750cf5be82aa1", + "3f86fe0139a24bacab113d117cf6efa5", + "06c4d35426974827b7a384b3ad6fe18b", + "868a0c4e84a34e1cbf2bdda5c5abd951", + "191aa75a4efa4f079b42e53c19c75772", + "893892ced90743f2ad6d4cd344839d9c", + "76e1070d2fef436c9befa7e52cb07f65", + "92baae064b55425c9ea5ef9283b0f327", + "70fe850813b346ca9b204cb4b9703e0a", + "48db767683f648d09287f31d155530b6", + "9a7ca2b5646748f7af9bb67c25e9aca4", + "5d6f086f1a6c4160bcd6b30e1e9f7f82", + "86e6c78d06b54c3aa28336e7e7afb856", + "d74ae3ee0b814132ad7500950cbca273", + "f306011363084d61b59f9349f631d416", + "b1bd1cd6863c48da865a9155f16da073", + "b8a6d9de3f2e4712979cf0647002ba29", + "bb6e22e6c1484479b025bf5a2135befe", + "27b4d3b46ba940959094de15a8e8cc66", + "829ae4c4f0c2410d89529787bd71db2a", + "47e03cf4784e4ed98c804d59342c5dcb", + "ce7f2d28ea814ff3886a159bd6656b05", + "7bc7b7a813aa4107bdc828661e06b561", + "a18570ba476240728285d05f364e3fe0", + "802603b888a0474f9c5dcd8a527eba49", + "c81d2f551a04404983dd93acbd200cca", + "612fe82a8d574c25b3f6c9399b682f2c", + "2829b9562a1345e5bc6be60bfd09a382", + "dad17915571b43d79d5ea9bd01a0cf94", + "1f6ad17118214b35abdb153ff2a056ed", + "812d6cca409f49a6b47c6c1d121e71c1", + "a47f4a22ff804e9fbbf1e592b8d0e00e", + "e2aa185233634b3298c250332bf8705c", + "a9ee9b6c1bb74e4ca4198443caf6a86f", + "5ba1afef08854ea0934e67da23245155", + "1959ffc7f4514ece8c5abd2bf67ecd45", + "c6a2d555014b481f9557aa02eac68f3b", + "7ad9aaf6b8f34edca61429d7aca2c8c8", + 
"ccfaa864abad4ed8871873487c71b93a", + "5e7c88da09a64d7b9f0271cb4d551931", + "3b5d643809ac4dab978951aee12e5bee", + "954fe45102bd46c5b178f052acb4bd47", + "4a2a62fc28134a78b778834e101c911f" + ] + }, + "id": "qF5Pp3DuVgSm", + "outputId": "8f45c743-b4be-48ab-d0ca-2201b7494066" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/515 [00:00 False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from optimum.intel import OVModelForQuestionAnswering\n", + "from transformers import AutoTokenizer\n", + "\n", + "MODEL_NAME = \"etalab-ia/camembert-base-squadFR-fquad-piaf\"\n", + "EXPORT_PATH = f\"ov_models/{MODEL_NAME}\"\n", + "\n", + "ov_model = OVModelForQuestionAnswering.from_pretrained(MODEL_NAME, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "# Save the OpenVINO model\n", + "ov_model.save_pretrained(EXPORT_PATH)\n", + "tokenizer.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "source": [ + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets" + ], + "metadata": { + "id": "MpN0X482erNw" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pOw43Yc1IOIk" + }, + "source": [ + "## Import and Save CamemBertForQuestionAnswering in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "88JjaGLRIOIl" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tIT4s8QWIOIl", + "outputId": "71d2d324-daec-4ee9-ac2c-663cd5472883" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.4\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.4\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+      "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.7/540.7 kB\u001b[0m \u001b[31m48.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+     ]
+    }
+   ],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "OlfoFLR5IOIl"
+   },
+   "source": [
+    "Let's start Spark with Spark NLP included via our simple `start()` function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bqi2I04RIOIl",
+    "outputId": "20805c5d-bda5-409a-9256-7379ee9e744f"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Apache Spark version: 3.2.3\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sparknlp\n",
+    "# let's start Spark with Spark NLP\n",
+    "spark = sparknlp.start()\n",
+    "\n",
+    "print(\"Apache Spark version: {}\".format(spark.version))"
+   ]
+  },
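+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Since OpenVINO models need `Spark NLP 5.4.0` or newer, it's worth double-checking the version we just started. This is only an optional sanity check, shown as a minimal sketch; `sparknlp.version()` is the standard version helper."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (sketch): OpenVINO models need Spark NLP 5.4.0 or newer\n",
+    "print(sparknlp.version())"
+   ]
+  },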
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "IUhzyzQCIOIl"
+   },
+   "source": [
+    "- Let's use the `loadSavedModel` function in `CamemBertForQuestionAnswering`, which allows us to load the OpenVINO model we exported above\n",
+    "- Most params can be set later at runtime, when you load this model back into `CamemBertForQuestionAnswering` (e.g. `setMaxSentenceLength`), so don't worry about setting them now\n",
+    "- `loadSavedModel` accepts two params: the first is the path to the exported model, and the second is the SparkSession, i.e. the `spark` variable we previously started via `sparknlp.start()`\n",
+    "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind that the best and recommended way to move/share/reuse Spark NLP models is to use `write.save`, so you can use `.load()` from any file system natively.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "cZ3W4Su6IOIm"
+   },
+   "outputs": [],
+   "source": [
+    "from sparknlp.annotator import *\n",
+    "from sparknlp.base import *\n",
+    "\n",
+    "spanClassifier = CamemBertForQuestionAnswering.loadSavedModel(\n",
+    "     f\"{EXPORT_PATH}\",\n",
+    "     spark\n",
+    "     )\\\n",
+    "    .setInputCols([\"document_question\",'document_context'])\\\n",
+    "    .setOutputCol(\"answer\")\\\n",
+    "    .setCaseSensitive(False)\\\n",
+    "    .setMaxSentenceLength(512)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1yxcMiRvIOIm"
+   },
+   "source": [
+    "- Let's save it on disk, so it can be moved around more easily and loaded back later via the `.load` function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "FlTdYTxAIOIm"
+   },
+   "outputs": [],
+   "source": [
+    "spanClassifier.write().overwrite().save(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "92lNkNGnIOIm"
+   },
+   "source": [
+    "Let's clean up stuff we don't need anymore"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "sdQI3F4oIOIm"
+   },
+   "outputs": [],
+   "source": [
+    "!rm -rf {EXPORT_PATH}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "jUN-57PfIOIm"
+   },
+   "source": [
+    "Awesome 😎 !\n",
+    "\n",
+    "This is your CamemBertForQuestionAnswering model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1QUDETlIIOIn",
+    "outputId": "7c55ff07-a5db-45e2-e6cc-c9f4d1f015a8"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 430960\n",
+      "-rw-r--r-- 1 root root 440480388 Nov  3 13:41 camembert_classification_onnx\n",
+      "-rw-r--r-- 1 root root    810912 Nov  3 13:41 camembert_spp\n",
+      "drwxr-xr-x 2 root root      4096 Nov  3 13:41 metadata\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls -l {EXPORT_PATH}_spark_nlp_openvino"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gjGhPpjLIOIn"
+   },
+   "source": [
+    "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForQuestionAnswering model in Spark NLP πŸš€ pipeline!"
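+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before that, here is a quick optional check. This is only a minimal sketch: `qa_check` is a throwaway variable name used just for this cell, and `getCaseSensitive` is a standard Spark NLP param getter. We load the saved annotator back from disk and confirm that the `caseSensitive` param survived the round trip."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (sketch): load the saved annotator back from disk\n",
+    "qa_check = CamemBertForQuestionAnswering.load(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))\n",
+    "\n",
+    "# Should print False, matching the setCaseSensitive(False) we used above\n",
+    "print(qa_check.getCaseSensitive())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And here is the loaded model inside a full Spark NLP pipeline:"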
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "CHsM9KnyIOIn", + "outputId": "84c166f7-c1b3-4916-b422-351476b2a0e1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------+--------+\n", + "|question |result |\n", + "+---------------------+--------+\n", + "|OΓΉ est-ce que je vis?|[berlin]|\n", + "+---------------------+--------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"context\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_context\"])\n", + "\n", + "spanClassifier_loaded = CamemBertForQuestionAnswering.load(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\n", + "\n", + "pipeline = Pipeline().setStages([\n", + " document_assembler,\n", + " spanClassifier_loaded\n", + "])\n", + "\n", + "context = \"Mon nom est Wolfgang et je vis Γ  Berlin\"\n", + "question = \"OΓΉ est-ce que je vis?\"\n", + "\n", + "example = spark.createDataFrame([[question, context]]).toDF(\"question\", \"context\")\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "result.select(\"question\", \"answer.result\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FWPCPTnWIOIn" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `CamemBertForQuestionAnswering` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "b1081789b9c1420799c9d6a913a59383": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4dc79d1a123c49eca4a261cdf4e0bd57", + "IPY_MODEL_eab4e4bad57d4fd2ba8f6d1831758f7d", + "IPY_MODEL_8bfc817cf7ab45bfb64dd3286b7430d6" + ], + "layout": "IPY_MODEL_c0b72c2e75104483af2c0a187a4bc084" + } + }, + "4dc79d1a123c49eca4a261cdf4e0bd57": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c0efac1ef03403cb0334326b555ba54", + "placeholder": "​", + "style": "IPY_MODEL_0798f788965f43c0b897600266d31600", + "value": "config.json: 100%" + } + }, + "eab4e4bad57d4fd2ba8f6d1831758f7d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": 
"1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b3736e95f3c644b097413ed818bf6a46", + "max": 515, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a036b2fbaa6b4af7b4767aa37bfa0aa8", + "value": 515 + } + }, + "8bfc817cf7ab45bfb64dd3286b7430d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_47057927a7d74304a1c9ded7059d5c3e", + "placeholder": "​", + "style": "IPY_MODEL_e4bbb56ba1664157a440a1c320781555", + "value": " 515/515 [00:00<00:00, 9.70kB/s]" + } + }, + "c0b72c2e75104483af2c0a187a4bc084": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c0efac1ef03403cb0334326b555ba54": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0798f788965f43c0b897600266d31600": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b3736e95f3c644b097413ed818bf6a46": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a036b2fbaa6b4af7b4767aa37bfa0aa8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "47057927a7d74304a1c9ded7059d5c3e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + 
"padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e4bbb56ba1664157a440a1c320781555": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "33cea0738d6b42a285be912435f64663": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ac667dc780a44cc3b10750cf5be82aa1", + "IPY_MODEL_3f86fe0139a24bacab113d117cf6efa5", + "IPY_MODEL_06c4d35426974827b7a384b3ad6fe18b" + ], + "layout": "IPY_MODEL_868a0c4e84a34e1cbf2bdda5c5abd951" + } + }, + "ac667dc780a44cc3b10750cf5be82aa1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_191aa75a4efa4f079b42e53c19c75772", + "placeholder": "​", + "style": "IPY_MODEL_893892ced90743f2ad6d4cd344839d9c", + "value": "model.safetensors: 100%" + } + }, + "3f86fe0139a24bacab113d117cf6efa5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_76e1070d2fef436c9befa7e52cb07f65", + "max": 442518070, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_92baae064b55425c9ea5ef9283b0f327", + "value": 442518070 + } + }, + "06c4d35426974827b7a384b3ad6fe18b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_70fe850813b346ca9b204cb4b9703e0a", + "placeholder": "​", + "style": "IPY_MODEL_48db767683f648d09287f31d155530b6", + "value": " 443M/443M [00:02<00:00, 178MB/s]" + } + }, + "868a0c4e84a34e1cbf2bdda5c5abd951": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "191aa75a4efa4f079b42e53c19c75772": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "893892ced90743f2ad6d4cd344839d9c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "76e1070d2fef436c9befa7e52cb07f65": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "92baae064b55425c9ea5ef9283b0f327": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "70fe850813b346ca9b204cb4b9703e0a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "48db767683f648d09287f31d155530b6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9a7ca2b5646748f7af9bb67c25e9aca4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5d6f086f1a6c4160bcd6b30e1e9f7f82", + "IPY_MODEL_86e6c78d06b54c3aa28336e7e7afb856", + "IPY_MODEL_d74ae3ee0b814132ad7500950cbca273" + ], + "layout": "IPY_MODEL_f306011363084d61b59f9349f631d416" + } + }, + "5d6f086f1a6c4160bcd6b30e1e9f7f82": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", 
+ "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1bd1cd6863c48da865a9155f16da073", + "placeholder": "​", + "style": "IPY_MODEL_b8a6d9de3f2e4712979cf0647002ba29", + "value": "tokenizer_config.json: 100%" + } + }, + "86e6c78d06b54c3aa28336e7e7afb856": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bb6e22e6c1484479b025bf5a2135befe", + "max": 24, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_27b4d3b46ba940959094de15a8e8cc66", + "value": 24 + } + }, + "d74ae3ee0b814132ad7500950cbca273": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_829ae4c4f0c2410d89529787bd71db2a", + "placeholder": "​", + "style": "IPY_MODEL_47e03cf4784e4ed98c804d59342c5dcb", + "value": " 24.0/24.0 [00:00<00:00, 151B/s]" + } + }, + "f306011363084d61b59f9349f631d416": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1bd1cd6863c48da865a9155f16da073": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": 
null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8a6d9de3f2e4712979cf0647002ba29": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bb6e22e6c1484479b025bf5a2135befe": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "27b4d3b46ba940959094de15a8e8cc66": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "829ae4c4f0c2410d89529787bd71db2a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": 
null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "47e03cf4784e4ed98c804d59342c5dcb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ce7f2d28ea814ff3886a159bd6656b05": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7bc7b7a813aa4107bdc828661e06b561", + "IPY_MODEL_a18570ba476240728285d05f364e3fe0", + "IPY_MODEL_802603b888a0474f9c5dcd8a527eba49" + ], + "layout": "IPY_MODEL_c81d2f551a04404983dd93acbd200cca" + } + }, + "7bc7b7a813aa4107bdc828661e06b561": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_612fe82a8d574c25b3f6c9399b682f2c", + "placeholder": "​", + "style": "IPY_MODEL_2829b9562a1345e5bc6be60bfd09a382", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "a18570ba476240728285d05f364e3fe0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dad17915571b43d79d5ea9bd01a0cf94", + "max": 810912, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_1f6ad17118214b35abdb153ff2a056ed", + "value": 810912 + } + }, + "802603b888a0474f9c5dcd8a527eba49": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_812d6cca409f49a6b47c6c1d121e71c1", + "placeholder": "​", + "style": "IPY_MODEL_a47f4a22ff804e9fbbf1e592b8d0e00e", + "value": " 811k/811k [00:00<00:00, 4.69MB/s]" + } + }, + "c81d2f551a04404983dd93acbd200cca": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "612fe82a8d574c25b3f6c9399b682f2c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2829b9562a1345e5bc6be60bfd09a382": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dad17915571b43d79d5ea9bd01a0cf94": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f6ad17118214b35abdb153ff2a056ed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "812d6cca409f49a6b47c6c1d121e71c1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a47f4a22ff804e9fbbf1e592b8d0e00e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e2aa185233634b3298c250332bf8705c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a9ee9b6c1bb74e4ca4198443caf6a86f", + "IPY_MODEL_5ba1afef08854ea0934e67da23245155", + "IPY_MODEL_1959ffc7f4514ece8c5abd2bf67ecd45" + ], + "layout": "IPY_MODEL_c6a2d555014b481f9557aa02eac68f3b" + } + }, + "a9ee9b6c1bb74e4ca4198443caf6a86f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7ad9aaf6b8f34edca61429d7aca2c8c8", + "placeholder": "​", + "style": "IPY_MODEL_ccfaa864abad4ed8871873487c71b93a", + "value": "special_tokens_map.json: 100%" + } + }, + "5ba1afef08854ea0934e67da23245155": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e7c88da09a64d7b9f0271cb4d551931", + "max": 210, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3b5d643809ac4dab978951aee12e5bee", + "value": 210 + } + }, + "1959ffc7f4514ece8c5abd2bf67ecd45": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_954fe45102bd46c5b178f052acb4bd47", + "placeholder": "​", + "style": "IPY_MODEL_4a2a62fc28134a78b778834e101c911f", + "value": " 210/210 [00:00<00:00, 10.5kB/s]" + } + }, + "c6a2d555014b481f9557aa02eac68f3b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7ad9aaf6b8f34edca61429d7aca2c8c8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ccfaa864abad4ed8871873487c71b93a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e7c88da09a64d7b9f0271cb4d551931": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3b5d643809ac4dab978951aee12e5bee": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, 
+ "954fe45102bd46c5b178f052acb4bd47": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a2a62fc28134a78b778834e101c911f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb new file mode 100644 index 00000000000000..80e13f4b0dca0a --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_V5XcDCnVgSi" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForSequenceClassification.ipynb)\n", + "\n", + "# Import OpenVINO CamemBertForSequenceClassification models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and exporting CamemBertForSequenceClassification models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high performance inference for models. 
Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for CamemBertForSequenceClassification from HuggingFace and they have to be in the `Text Classification` category." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aghasVppVgSk" + }, + "source": [ + "## 1. Export and Save the HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be4HsTDMVgSk" + }, + "source": [ + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` to be installed for Spark NLP; however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.41.2`. This doesn't mean it won't work with future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "a0a24d02-645e-4776-8b6b-d489ce2fba5f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m527.3/527.3 kB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m421.5/421.5 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m73.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.9/39.9 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m91.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.63.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.25.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.23.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 
which is incompatible.\n", + "googleapis-common-protos 1.64.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- To load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. We can use this to import and export OpenVINO models with `from_pretrained` and `save_pretrained`.\n", + "- By setting `export=True`, the source model is converted to OpenVINO IR format on the fly.\n", + "- We'll use the [antoinelouis/crossencoder-camembert-large-mmarcoFR](https://huggingface.co/antoinelouis/crossencoder-camembert-large-mmarcoFR) model from HuggingFace as an example and load it as an `OVModelForSequenceClassification`, representing an OpenVINO model.\n", + "- In addition to the OVModelForSequenceClassification model, we also need to save the `AutoTokenizer`. This is the same for every model: these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qF5Pp3DuVgSm", + "outputId": "29a37d5a-2ffd-43de-83ef-7e814dd6f10f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "Framework not specified. Using pt to export the model.\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "Using framework PyTorch: 2.4.0+cu121\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from optimum.intel import OVModelForSequenceClassification\n", + "from transformers import AutoTokenizer\n", + "\n", + "MODEL_NAME = \"antoinelouis/crossencoder-camembert-large-mmarcoFR\"\n", + "EXPORT_PATH = f\"ov_models/{MODEL_NAME}\"\n", + "\n", + "ov_model = OVModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "# Save the OpenVINO model\n", + "ov_model.save_pretrained(EXPORT_PATH)\n", + "tokenizer.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "source": [ + "# get the id2label dictionary\n", + "labels = ov_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(EXPORT_PATH + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ], + "metadata": { + "id": "mV-zeLoUSPdB" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets" + ], + "metadata": { + "id": "MpN0X482erNw" + }, + "execution_count": 6, + "outputs": [] + }, 
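+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a quick sanity check (an optional extra step), let's inspect what the export produced. You should see the OpenVINO IR files (`openvino_model.xml` and `openvino_model.bin`) at the top level, and `labels.txt` plus `sentencepiece.bpe.model` inside `assets`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!ls -lR {EXPORT_PATH}" + ] + }, 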
+ { + "cell_type": "markdown", + "metadata": { + "id": "MklXFA9IgYGw" + }, + "source": [ + "## Import and Save CamemBertForSequenceClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YWxaWzPwgYGx" + }, + "source": [ + "- Let's install and set up Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wIWlyDh2gYGx", + "outputId": "b65facd5-2112-40ac-ea3d-728c3b588377" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-11-03 19:56:31-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-11-03 19:56:31-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: β€˜STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-11-03 19:56:31 (92.1 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.4\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.4\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.7/540.7 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m22.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y_Zap7VmgYG0" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VEQMY5l9gYG0", + "outputId": "e0011629-e7e9-4b53-ae47-e460d5c93e23" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bBkc1p8hgYG0" + }, + "source": [ + "- Let's use the `loadSavedModel` function in `CamemBertForSequenceClassification`, which allows us to load the OpenVINO model we exported above\n", + "- Most params, such as `setMaxSentenceLength`, can also be set later at runtime when you load this model back into `CamemBertForSequenceClassification`, so don't worry about setting them now\n", + "- `loadSavedModel` accepts two params: the first is the path to the exported model folder, and the second is the SparkSession, the `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file system natively."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9B1TdMOqgYG1" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "sequenceClassifier = CamemBertForSequenceClassification.loadSavedModel(\n", + " f\"{EXPORT_PATH}\",\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oqpVT29ugYG1" + }, + "source": [ + "- Let's save it on disk so it is easier to move around and can be used later via the `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tDZ3NElngYG1" + }, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P9Xd5_ETgYG1" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F_8UN6AdgYG1" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XmMOnfb9gYG1" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your CamemBertForSequenceClassification model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aM2k8S7mgYG1", + "outputId": "aa6572a4-80dc-4948-8740-14c985ab44f8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 433456\n", + "-rw-r--r-- 1 root root 443034301 Nov 3 20:00 camembert_classification_onnx\n", + "-rw-r--r-- 1 root root 810912 Nov 3 20:00 camembert_spp\n", + "drwxr-xr-x 3 root root 4096 Nov 3 19:58 fields\n", + "drwxr-xr-x 2 root root 4096 Nov 3 19:58 metadata\n" + ] + } + ], + "source": [ + "!
ls -l {EXPORT_PATH}_spark_nlp_openvino" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b4arm8CUgYG2" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForSequenceClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K2GL0-SZgYG2" + }, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = CamemBertForSequenceClassification.load(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "17f2GUpIgYG2" + }, + "source": [ + "You can see what labels were used to train this model via the `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1bdTgvvfgYG2", + "outputId": "32f4b52b-0bf8-4bbb-c083-392513229595" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['NEGATIVE', 'POSITIVE']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7tlb_MaogYG2" + }, + "source": [ + "This is how you can use your loaded classifier model in a Spark NLP πŸš€ pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iyLVgsgWgYG2", + "outputId": "43ae2c11-0eb6-441d-b96b-d8e742697f11" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+----------+\n", + "| text| result|\n", + "+--------------------+----------+\n", + "|Alad'2 est claire...|[POSITIVE]|\n", + "|Je m'attendais Γ  ...|[NEGATIVE]|\n", + "+--------------------+----------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.ml import Pipeline\n", + "\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " sequenceClassifier_loaded\n", + "])\n", + "\n", + "# a couple of simple examples\n", + "example = spark.createDataFrame([[\"Alad'2 est clairement le meilleur film de l'annΓ©e 2018.\"], [\"Je m'attendais Γ  mieux de la part de Franck Dubosc !\"]]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"class.result\").show()" + ] + }, 
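+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For quick experiments on a handful of strings, you can also wrap the fitted pipeline in a `LightPipeline` and annotate plain Python text without building a DataFrame first. A minimal sketch of this optional step (the French sentence below is just an illustrative input):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sparknlp.base import LightPipeline\n", + "\n", + "# LightPipeline runs a fitted PipelineModel on plain strings, handy for quick local tests\n", + "light_pipeline = LightPipeline(pipeline.fit(example))\n", + "\n", + "# annotate() returns a dict with one entry per output column (document, token, class)\n", + "light_pipeline.annotate(\"Ce film est une vraie rΓ©ussite !\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u8j_SE0dgYG2" + }, + "source": [ + "That's it! 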
You can now go wild and use hundreds of `CamemBertForSequenceClassification` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb new file mode 100644 index 00000000000000..95578eed6ac97c --- /dev/null +++ b/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb @@ -0,0 +1,2429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_V5XcDCnVgSi" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/openvino/HuggingFace_OpenVINO_in_Spark_NLP_CamemBertForTokenClassification.ipynb)\n", + "\n", + "# Import OpenVINO CamemBertForTokenClassification models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "This notebook provides a detailed walkthrough on optimizing and exporting CamemBertForTokenClassification models from HuggingFace for use in Spark NLP, leveraging the various tools provided in the [Intel OpenVINO toolkit](https://www.intel.com/content/www/us/en/developer/tools/openvino-toolkit/overview.html) ecosystem.\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- OpenVINO support was introduced in `Spark NLP 5.4.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You can import models for CamemBertForTokenClassification from HuggingFace and they have to be in the `Token Classification` category." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aghasVppVgSk" + }, + "source": [ + "## 1. Export and Save the HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be4HsTDMVgSk" + }, + "source": [ + "- Let's install `transformers` and `openvino` packages with other dependencies. You don't need `openvino` to be installed for Spark NLP; however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.41.2`. This doesn't mean it won't work with future releases, but we wanted you to know which versions have been tested successfully."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-7L-2ZWUVgSl", + "outputId": "191a9dba-e777-4064-aeb9-de8bfbae9b5e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.8/43.8 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m16.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.7/38.7 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.7/215.7 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m527.3/527.3 kB\u001b[0m \u001b[31m22.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m421.5/421.5 kB\u001b[0m \u001b[31m23.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.9/15.9 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m39.9/39.9 MB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 17.0.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m59.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m36.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 17.0.0 which is incompatible.\n", + "google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-api-core 2.19.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-aiplatform 1.63.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-connection 1.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigquery-storage 2.25.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-bigtable 2.26.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-datastore 2.19.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-firestore 2.16.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-functions 1.16.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-iam 2.15.2 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-language 2.13.4 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-pubsub 2.23.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-resource-manager 1.12.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "google-cloud-translate 3.15.5 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "googleapis-common-protos 1.64.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0.dev0,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "grpc-google-iam-v1 0.13.1 requires protobuf!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow 2.17.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.1 which is incompatible.\n", + "tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < \"3.11\", but you have protobuf 3.20.1 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip 
install -q --upgrade transformers==4.41.2\n", + "!pip install -q --upgrade openvino==2024.1\n", + "!pip install -q --upgrade optimum-intel==1.17.0\n", + "!pip install -q --upgrade onnx==1.12.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vI7uz_6hVgSl" + }, + "source": [ + "[Optimum Intel](https://github.com/huggingface/optimum-intel?tab=readme-ov-file#openvino) is the interface between the Transformers library and the various model optimization and acceleration tools provided by Intel. HuggingFace models loaded with optimum-intel are automatically optimized for OpenVINO, while being compatible with the Transformers API.\n", + "- To load a HuggingFace model directly for inference/export, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. We can use this to import and export OpenVINO models with `from_pretrained` and `save_pretrained`.\n", + "- By setting `export=True`, the source model is converted to OpenVINO IR format on the fly.\n", + "- We'll use the [Jean-Baptiste/camembert-ner](https://huggingface.co/Jean-Baptiste/camembert-ner) model from HuggingFace as an example and load it as an `OVModelForTokenClassification`, representing an OpenVINO model.\n", + "- In addition to the OVModelForTokenClassification model, we also need to save the `AutoTokenizer`. This is the same for every model: these are assets (saved in `/assets`) needed for tokenization inside Spark NLP." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 617, + "referenced_widgets": [ + "285bad4316ae4a4eace151bfce147dc0", + "f8daa06df57a43c9afb7f2904ea78fc8", + "bcde4c6901b3424e9ad031e0de66efac", + "b6a7270744f24eb5921733fe8ffedf5b", + "8d205872f18d443c8016732620c00c83", + "872fb89102eb4bea88b10ad39ce7890e", + "29c362f1824e495489e9aae12db175c0", + "a90c4c4679114f0d8222cff96b225388", + "bf5d6e2307d9436e8ac041bc6ee4efa1", + "52dc6ea8fa0e45c8aaae33ba34eb3238", + "a4bb2bd411cf41478583c2cc941f6a9b", + "bc6c232edff34eb884ddd4550ef22385", + "c8c1a59cada94cf8a1b776c07bfb84d3", + "e78319ac2cbb4c9290d5beabbb18634f", + "23017d56938a4d93874acff6fad3e1ba", + "415df9ed80af4de8a319ed8d36ffe62b", + "52b6a52a77c4434ea2914445afd825ed", + "c8b1bb9e628e416ca7c92550ead8fffd", + "725a587f251645c989ef9cc92ff8a9df", + "5d9a35b2a3ca4be9b8338ebfd53e7ee8", + "25354f5d522547fb82c08ebaa5fc6f63", + "dbb922ac391245739a057ec33716de5d", + "8439f33f9ad44ab3a3e6b062584593e9", + "4b58681d6eed4d388f673a014c4c8970", + "571a689562df482a9059dd2561ff5b2e", + "3d3d6d4dde2d4586994402a7980083c4", + "b869e8fa74084ae2b89ee23d81ae6a5e", + "10cfb3667c5e4bc5bddc1fbd31813204", + "cad68acf72ca448d81027ee6cb0df531", + "82b70a162ac24e9f8e66b7ca8ac50bb3", + "459592e9ee7b4ec6942b00c830f8edd3", + "4d6b643d3c1341be91c0ac9249c084d6", + "826f737422e4461fb57f68891696a098", + "fb7beba8d33847fd96ee61a0773ddc83", + "7ea5643c28e1449ab427c6d2959f1c01", + "fdc89d2f42884f6ea3c8a87bdb8bb6fb", + "cc0c03d5384444759d214476c610b826", + "ccd3d08af037460c91c83adb7a618886", + "a5899d5737c5465f9bbbeb4eac096739", + "07f28bbe0fa64e93bd191f8bc389e8c6", + "a95ac0dd615a4921a4df9f88d4aef1bd", + "2601af07921241438c892be192678a5a", + "23a0b6515afb496b9215e07a48d397ea", + "91be2525e3bf44108588d7ee82d89236", + "ac2e9328379e43ee9a6dc9427eaab423", + "29dd81e967dd4860a2c9f0a5b8cb5670", + "e558f5a7ddd74dbaa89f1b190d0696f6", + "2d49b0cbc9af4866bd2719234e312c58", + "d32d0ed8c5b448519f7b211ffd306e5e", + "ee8401a67e0641ac87df3c16dfc8ca51", + "5e027f1e752a4642af7a242c1e0d814e", 
+ "3d13f400299343e29c8ec6ccff83fb08", + "0fa6c310d2c74ca890c8b2c87ada537b", + "9412734ce6714680831d4869cae03e54", + "8b387b73275e4f26ad56c7430f10a35c" + ] + }, + "id": "qF5Pp3DuVgSm", + "outputId": "712f7828-a46e-470e-d1d3-84df90e1fa95" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/892 [00:00 False\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "Compiling the model to CPU ...\n" + ] + } + ], + "source": [ + "from optimum.intel import OVModelForTokenClassification\n", + "from transformers import AutoTokenizer\n", + "\n", + "MODEL_NAME = \"Jean-Baptiste/camembert-ner\"\n", + "EXPORT_PATH = f\"ov_models/{MODEL_NAME}\"\n", + "\n", + "ov_model = OVModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", + "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n", + "\n", + "# Save the OpenVINO model\n", + "ov_model.save_pretrained(EXPORT_PATH)\n", + "tokenizer.save_pretrained(EXPORT_PATH)\n", + "\n", + "# Create directory for assets and move the tokenizer files.\n", + "# A separate folder is needed for Spark NLP.\n", + "!mkdir {EXPORT_PATH}/assets" + ] + }, + { + "cell_type": "code", + "source": [ + "# get label2id dictionary\n", + "labels = ov_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(EXPORT_PATH + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ], + "metadata": { + "id": "mV-zeLoUSPdB" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!mv {EXPORT_PATH}/sentencepiece.bpe.model {EXPORT_PATH}/assets" + ], + "metadata": { + "id": "MpN0X482erNw" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!ls -lR {EXPORT_PATH}" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_9NJU39xhaIH", + "outputId": "d86ea3de-5100-428e-e60b-ac8e20e7c579" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + 
"ov_models/Jean-Baptiste/camembert-ner:\n", + "total 432616\n", + "drwxr-xr-x 2 root root 4096 Sep 2 00:45 assets\n", + "-rw-r--r-- 1 root root 936 Sep 2 00:45 config.json\n", + "-rw-r--r-- 1 root root 440145104 Sep 2 00:45 openvino_model.bin\n", + "-rw-r--r-- 1 root root 407191 Sep 2 00:45 openvino_model.xml\n", + "-rw-r--r-- 1 root root 354 Sep 2 00:45 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 1589 Sep 2 00:45 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 2421069 Sep 2 00:45 tokenizer.json\n", + "\n", + "ov_models/Jean-Baptiste/camembert-ner/assets:\n", + "total 796\n", + "-rw-r--r-- 1 root root 26 Sep 2 00:45 labels.txt\n", + "-rw-r--r-- 1 root root 810912 Sep 2 00:45 sentencepiece.bpe.model\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NySQijMkijcf" + }, + "source": [ + "## Import and Save CamemBertForTokenClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_oOaqmCtijcf" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_tFWz1H1ijcf", + "outputId": "57593fab-a5d5-46a3-8ef9-9934e56b6f02" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.4\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.4\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m540.7/540.7 kB\u001b[0m \u001b[31m38.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "egViPt5dijcf" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YHdsyve1ijcf", + "outputId": "ab76b16a-80e6-46c5-923d-c88d81fcc175" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_i0jnw_fijcf" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `CamemBertForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `CamemBertForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. 
The second is the SparkSession, the `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file system natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YPdiwbZ_ijcf" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "tokenClassifier = CamemBertForTokenClassification.loadSavedModel(\n", + " f\"{EXPORT_PATH}\",\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ebJdNwT5ijcg" + }, + "source": [ + "- Let's save it on disk so it is easier to move around and can be used later via the `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JMw8tLgZijcg" + }, + "outputs": [], + "source": [ + "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "em7Gd4kNijcg" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dqn4EeDnijcg" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JGhE77gBijcg" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your CamemBertForTokenClassification model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G5M56aTNijcg", + "outputId": "e1ed3b1a-444e-454e-bfaf-b47df4347a78" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 430972\n", + "-rw-r--r-- 1 root root 440489555 Nov 3 19:18 camembert_classification_onnx\n", + "-rw-r--r-- 1 root root 810912 Nov 3 19:18 camembert_spp\n", + "drwxr-xr-x 3 root root 4096 Nov 3 19:18 fields\n", + "drwxr-xr-x 2 root root 4096 Nov 3 19:18 metadata\n" + ] + } + ], + "source": [ + "!
ls -l {EXPORT_PATH}_spark_nlp_openvino"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "D_7AEXz5ijcg"
+   },
+   "source": [
+    "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny CamemBertForTokenClassification model 😊"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Qlt__GOqijcg"
+   },
+   "outputs": [],
+   "source": [
+    "tokenClassifier_loaded = CamemBertForTokenClassification.load(\"./{}_spark_nlp_openvino\".format(EXPORT_PATH))\\\n",
+    "  .setInputCols([\"document\",'token'])\\\n",
+    "  .setOutputCol(\"ner\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Y3XfhGEcijcg"
+   },
+   "source": [
+    "You can see what labels were used to train this model via the `getClasses` function:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "iiC0T624ijcg",
+    "outputId": "c36615c1-9a59-4d26-d2a7-fc6e600ad5f3"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['I-ORG', 'I-MISC', 'I-LOC', 'I-PER', 'O']"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# .getClasses was introduced in spark-nlp==3.4.0\n",
+    "tokenClassifier_loaded.getClasses()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "REIqYCyMijcg"
+   },
+   "source": [
+    "This is how you can use your loaded classifier model in a Spark NLP πŸš€ pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ybzmSKjMijcg",
+    "outputId": "043edd1d-579d-4b35-e862-03ca9a7f7913"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+--------------------+\n",
+      "| text| result|\n",
+      "+--------------------+--------------------+\n",
+      "|Je m'appelle jean...|[O, O, I-PER, O, ...|\n",
+      "|george washington...|[I-PER, I-PER, O,...|\n",
+      "+--------------------+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pyspark.ml import Pipeline\n",
+    "\n",
+    "document_assembler = DocumentAssembler() \\\n",
+    "    .setInputCol('text') \\\n",
+    "    .setOutputCol('document')\n",
+    "\n",
+    "tokenizer = Tokenizer() \\\n",
+    "    .setInputCols(['document']) \\\n",
+    "    .setOutputCol('token')\n",
+    "\n",
+    "pipeline = Pipeline(stages=[\n",
+    "    document_assembler,\n",
+    "    tokenizer,\n",
+    "    tokenClassifier_loaded\n",
+    "])\n",
+    "\n",
+    "# a couple of simple examples\n",
+    "example = spark.createDataFrame([[\"Je m'appelle jean-baptiste et je vis Γ  montrΓ©al\"], ['george washington est allΓ© Γ  washington']]).toDF(\"text\")\n",
+    "\n",
+    "result = pipeline.fit(example).transform(example)\n",
+    "\n",
+    "# result is a DataFrame\n",
+    "result.select(\"text\", \"ner.result\").show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1dl_Ju77ijcg"
+   },
+   "source": [
+    "That's it! 
You can now go wild and use hundreds of `CamemBertForTokenClassification` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "285bad4316ae4a4eace151bfce147dc0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f8daa06df57a43c9afb7f2904ea78fc8", + "IPY_MODEL_bcde4c6901b3424e9ad031e0de66efac", + "IPY_MODEL_b6a7270744f24eb5921733fe8ffedf5b" + ], + "layout": "IPY_MODEL_8d205872f18d443c8016732620c00c83" + } + }, + "f8daa06df57a43c9afb7f2904ea78fc8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_872fb89102eb4bea88b10ad39ce7890e", + "placeholder": "​", + "style": "IPY_MODEL_29c362f1824e495489e9aae12db175c0", + "value": "config.json: 100%" + } + }, + "bcde4c6901b3424e9ad031e0de66efac": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a90c4c4679114f0d8222cff96b225388", + "max": 892, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_bf5d6e2307d9436e8ac041bc6ee4efa1", + "value": 892 + } + }, + "b6a7270744f24eb5921733fe8ffedf5b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52dc6ea8fa0e45c8aaae33ba34eb3238", + "placeholder": "​", + "style": "IPY_MODEL_a4bb2bd411cf41478583c2cc941f6a9b", + "value": " 892/892 [00:00<00:00, 23.6kB/s]" + } + }, + "8d205872f18d443c8016732620c00c83": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "872fb89102eb4bea88b10ad39ce7890e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "29c362f1824e495489e9aae12db175c0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a90c4c4679114f0d8222cff96b225388": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bf5d6e2307d9436e8ac041bc6ee4efa1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "52dc6ea8fa0e45c8aaae33ba34eb3238": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a4bb2bd411cf41478583c2cc941f6a9b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bc6c232edff34eb884ddd4550ef22385": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c8c1a59cada94cf8a1b776c07bfb84d3", + "IPY_MODEL_e78319ac2cbb4c9290d5beabbb18634f", + "IPY_MODEL_23017d56938a4d93874acff6fad3e1ba" + ], + "layout": "IPY_MODEL_415df9ed80af4de8a319ed8d36ffe62b" + } + }, + "c8c1a59cada94cf8a1b776c07bfb84d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + 
"_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_52b6a52a77c4434ea2914445afd825ed", + "placeholder": "​", + "style": "IPY_MODEL_c8b1bb9e628e416ca7c92550ead8fffd", + "value": "model.safetensors: 100%" + } + }, + "e78319ac2cbb4c9290d5beabbb18634f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_725a587f251645c989ef9cc92ff8a9df", + "max": 440168896, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5d9a35b2a3ca4be9b8338ebfd53e7ee8", + "value": 440168896 + } + }, + "23017d56938a4d93874acff6fad3e1ba": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_25354f5d522547fb82c08ebaa5fc6f63", + "placeholder": "​", + "style": "IPY_MODEL_dbb922ac391245739a057ec33716de5d", + "value": " 440M/440M [00:06<00:00, 76.7MB/s]" + } + }, + "415df9ed80af4de8a319ed8d36ffe62b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "52b6a52a77c4434ea2914445afd825ed": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c8b1bb9e628e416ca7c92550ead8fffd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "725a587f251645c989ef9cc92ff8a9df": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5d9a35b2a3ca4be9b8338ebfd53e7ee8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "25354f5d522547fb82c08ebaa5fc6f63": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": 
null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dbb922ac391245739a057ec33716de5d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8439f33f9ad44ab3a3e6b062584593e9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4b58681d6eed4d388f673a014c4c8970", + "IPY_MODEL_571a689562df482a9059dd2561ff5b2e", + "IPY_MODEL_3d3d6d4dde2d4586994402a7980083c4" + ], + "layout": "IPY_MODEL_b869e8fa74084ae2b89ee23d81ae6a5e" + } + }, + "4b58681d6eed4d388f673a014c4c8970": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_10cfb3667c5e4bc5bddc1fbd31813204", + "placeholder": "​", + "style": "IPY_MODEL_cad68acf72ca448d81027ee6cb0df531", + "value": "tokenizer_config.json: 100%" + } + }, + "571a689562df482a9059dd2561ff5b2e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_82b70a162ac24e9f8e66b7ca8ac50bb3", + "max": 269, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_459592e9ee7b4ec6942b00c830f8edd3", + "value": 269 + } + }, + "3d3d6d4dde2d4586994402a7980083c4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d6b643d3c1341be91c0ac9249c084d6", + "placeholder": "​", + "style": "IPY_MODEL_826f737422e4461fb57f68891696a098", + "value": " 269/269 [00:00<00:00, 392B/s]" + } + }, + "b869e8fa74084ae2b89ee23d81ae6a5e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "10cfb3667c5e4bc5bddc1fbd31813204": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cad68acf72ca448d81027ee6cb0df531": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "82b70a162ac24e9f8e66b7ca8ac50bb3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "459592e9ee7b4ec6942b00c830f8edd3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d6b643d3c1341be91c0ac9249c084d6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "826f737422e4461fb57f68891696a098": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fb7beba8d33847fd96ee61a0773ddc83": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ea5643c28e1449ab427c6d2959f1c01", + "IPY_MODEL_fdc89d2f42884f6ea3c8a87bdb8bb6fb", + "IPY_MODEL_cc0c03d5384444759d214476c610b826" + ], + "layout": "IPY_MODEL_ccd3d08af037460c91c83adb7a618886" + } + }, + "7ea5643c28e1449ab427c6d2959f1c01": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a5899d5737c5465f9bbbeb4eac096739", + "placeholder": "​", + "style": "IPY_MODEL_07f28bbe0fa64e93bd191f8bc389e8c6", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "fdc89d2f42884f6ea3c8a87bdb8bb6fb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a95ac0dd615a4921a4df9f88d4aef1bd", + "max": 810912, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2601af07921241438c892be192678a5a", + "value": 810912 + } + }, + "cc0c03d5384444759d214476c610b826": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_23a0b6515afb496b9215e07a48d397ea", + "placeholder": "​", + "style": "IPY_MODEL_91be2525e3bf44108588d7ee82d89236", + "value": " 811k/811k [00:00<00:00, 3.85MB/s]" + } + }, + "ccd3d08af037460c91c83adb7a618886": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + 
"order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5899d5737c5465f9bbbeb4eac096739": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "07f28bbe0fa64e93bd191f8bc389e8c6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a95ac0dd615a4921a4df9f88d4aef1bd": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2601af07921241438c892be192678a5a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": 
"StyleView", + "bar_color": null, + "description_width": "" + } + }, + "23a0b6515afb496b9215e07a48d397ea": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "91be2525e3bf44108588d7ee82d89236": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ac2e9328379e43ee9a6dc9427eaab423": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_29dd81e967dd4860a2c9f0a5b8cb5670", + "IPY_MODEL_e558f5a7ddd74dbaa89f1b190d0696f6", + "IPY_MODEL_2d49b0cbc9af4866bd2719234e312c58" + ], + "layout": "IPY_MODEL_d32d0ed8c5b448519f7b211ffd306e5e" + } + }, + "29dd81e967dd4860a2c9f0a5b8cb5670": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ee8401a67e0641ac87df3c16dfc8ca51", + "placeholder": "​", + "style": "IPY_MODEL_5e027f1e752a4642af7a242c1e0d814e", + "value": "special_tokens_map.json: 100%" + } + }, + "e558f5a7ddd74dbaa89f1b190d0696f6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d13f400299343e29c8ec6ccff83fb08", + "max": 210, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0fa6c310d2c74ca890c8b2c87ada537b", + "value": 210 + } + }, + "2d49b0cbc9af4866bd2719234e312c58": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9412734ce6714680831d4869cae03e54", + "placeholder": "​", + "style": "IPY_MODEL_8b387b73275e4f26ad56c7430f10a35c", + "value": " 210/210 [00:00<00:00, 11.2kB/s]" + } + }, + "d32d0ed8c5b448519f7b211ffd306e5e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee8401a67e0641ac87df3c16dfc8ca51": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e027f1e752a4642af7a242c1e0d814e": { + "model_module": 
"@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3d13f400299343e29c8ec6ccff83fb08": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0fa6c310d2c74ca890c8b2c87ada537b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9412734ce6714680831d4869cae03e54": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b387b73275e4f26ad56c7430f10a35c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { 
+ "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala index 3b2d7ef2c7663d..a9929a2ce95352 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala @@ -16,15 +16,17 @@ package com.johnsnowlabs.ml.ai -import ai.onnxruntime.{OnnxTensor, OrtEnvironment} +import ai.onnxruntime.OnnxTensor +import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} +import com.johnsnowlabs.ml.openvino.OpenvinoWrapper import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ -import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} +import org.intel.openvino.Tensor import org.tensorflow.ndarray.buffer.{IntDataBuffer, LongDataBuffer} import org.slf4j.{Logger, LoggerFactory} @@ -44,6 +46,7 @@ import scala.collection.JavaConverters._ private[johnsnowlabs] class CamemBertClassification( val tensorflowWrapper: Option[TensorflowWrapper], val onnxWrapper: Option[OnnxWrapper], + val openvinoWrapper: Option[OpenvinoWrapper], val spp: SentencePieceWrapper, configProtoBytes: Option[Array[Byte]] = None, tags: Map[String, Int], @@ -58,6 +61,7 @@ private[johnsnowlabs] class CamemBertClassification( val detectedEngine: String = if (tensorflowWrapper.isDefined) TensorFlow.name else if (onnxWrapper.isDefined) ONNX.name + else if (openvinoWrapper.isDefined) Openvino.name else TensorFlow.name private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions @@ -98,19 +102,7 @@ private[johnsnowlabs] class CamemBertClassification( def tokenizeSeqString( candidateLabels: Seq[String], maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { - val basicTokenizer = new BasicTokenizer(caseSensitive) - val encoder = - new SentencepieceEncoder(spp, caseSensitive, sentencePieceDelimiterId, pieceIdOffset = 1) - - val labelsToSentences = candidateLabels.map { s => Sentence(s, 0, s.length - 1, 0) } - - labelsToSentences.map(label => { - val tokens = basicTokenizer.tokenize(label) - val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) - WordpieceTokenizedSentence(wordpieceTokens) - }) - } + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = ??? 
def tokenizeDocument(
       docs: Seq[Annotation],
@@ -139,6 +131,7 @@ private[johnsnowlabs] class CamemBertClassification(
 
     val rawScores = detectedEngine match {
       case ONNX.name => getRawScoresWithOnnx(batch)
+      case Openvino.name => getRawScoresWithOv(batch, maxSentenceLength)
       case _ => getRawScoresWithTF(batch, maxSentenceLength)
     }
 
@@ -155,33 +148,31 @@ private[johnsnowlabs] class CamemBertClassification(
   private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = {
     val tensors = new TensorResources()
-//    val (tokenBuffers, maskBuffers) = initializeTFLongTensorResources(batch, tensors, maxSentenceLength)
     val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max
     val batchLength = batch.length
 
-    val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength)
-    val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength)
+    val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
+    val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength)
 
     // [nb of encoded sentences , maxSentenceLength]
     val shape = Array(batch.length.toLong, maxSentenceLength)
 
     batch.zipWithIndex
       .foreach { case (sentence, idx) =>
-        val sentenceLong = sentence.map(x => x.toLong)
         val offset = idx * maxSentenceLength
-        tokenBuffers.offset(offset).write(sentenceLong)
+        tokenBuffers.offset(offset).write(sentence)
         maskBuffers
           .offset(offset)
-          .write(sentence.map(x => if (x == sentencePadTokenId) 0L else 1L))
+          .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1))
       }
 
     val runner = tensorflowWrapper.get
       .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false)
       .runner
 
-    val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers)
-    val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers)
+    val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers)
+    val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers)
 
     runner
       .feed(
@@ -205,9 +196,52 @@ private[johnsnowlabs] class CamemBertClassification(
     rawScores
   }
 
+  private def getRawScoresWithOv(
+      batch: Seq[Array[Int]],
+      maxSentenceLength: Int): Array[Float] = {
+
+    val batchLength = batch.length
+    val (tokenTensors, maskTensors) =
+      PrepareEmbeddings.prepareOvLongBatchTensors(batch, maxSentenceLength, batchLength)
+
+    val inferRequest = openvinoWrapper.get.getCompiledModel().create_infer_request()
+    inferRequest.set_tensor("input_ids", tokenTensors)
+    inferRequest.set_tensor("attention_mask", maskTensors)
+    inferRequest.infer()
+
+    try {
+      inferRequest
+        .get_tensor("logits")
+        .data()
+    } catch {
+      case e: Exception =>
+        // Log the exception as a warning and rethrow it so the failure surfaces to the caller
+        logger.warn("Exception in getRawScoresWithOv", e)
+        throw e
+    }
+  }
+
   private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = {
+
     // [nb of encoded sentences , maxSentenceLength]
     val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions)
-    val (tokenTensors, maskTensors) = initializeOnnxTensorResources(batch, env)
+
+    val tokenTensors =
+      OnnxTensor.createTensor(env, batch.map(sentence => sentence.map(_.toLong)).toArray)
+    val maskTensors =
+      OnnxTensor.createTensor(
+        env,
+        batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray)
+
     val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava
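+    // The "input_ids" / "attention_mask" keys above must match the input names of the
+    // exported ONNX graph; CamemBERT models exported as in the accompanying notebooks
+    // use exactly these names.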
@@ -243,7 +277,8 @@ private[johnsnowlabs] class CamemBertClassification( val rawScores = detectedEngine match { case ONNX.name => getRawScoresWithOnnx(batch) - case _ => getRawScoresWithTF(batch, maxSentenceLength) + case Openvino.name => getRawScoresWithOv(batch, maxSentenceLength) + case TensorFlow.name => getRawScoresWithTF(batch, maxSentenceLength) } val dim = rawScores.length / batchLength @@ -265,114 +300,14 @@ private[johnsnowlabs] class CamemBertClassification( batch: Seq[Array[Int]], entailmentId: Int, contradictionId: Int, - activation: String): Array[Array[Float]] = { - - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max - val paddedBatch = batch.map(arr => padArrayWithZeros(arr, maxSentenceLength)) - val batchLength = paddedBatch.length - - val rawScores = detectedEngine match { - case TensorFlow.name => computeZeroShotLogitsWithTF(paddedBatch, maxSentenceLength) - case ONNX.name => computeZeroShotLogitsWithONNX(paddedBatch) - } - - val dim = rawScores.length / batchLength - rawScores - .grouped(dim) - .toArray - } - - def computeZeroShotLogitsWithONNX(batch: Seq[Array[Int]]): Array[Float] = { - val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) - val (tokenTensors, maskTensors) = initializeOnnxTensorResources(batch, env) - val inputs = - Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava - - try { - val results = runner.run(inputs) - try { - val embeddings = results - .get("logits") - .get() - .asInstanceOf[OnnxTensor] - .getFloatBuffer - .array() - tokenTensors.close() - maskTensors.close() - - embeddings - } finally if (results != null) results.close() - } - } - - def computeZeroShotLogitsWithTF( - batch: Seq[Array[Int]], - maxSentenceLength: Int): Array[Float] = { - val tensors = new TensorResources() - val (tokenBuffers, maskBuffers, segmentBuffers) = - initializeTFIntTensorResources(batch, tensors, maxSentenceLength) - // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) - - batch.zipWithIndex - .foreach { case (sentence, idx) => - val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentence) - maskBuffers - .offset(offset) - .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) - segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0)) - } - - val runner = tensorflowWrapper.get - .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) - .runner - - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) - - runner - .feed( - _tfCamemBertSignatures.getOrElse( - ModelSignatureConstants.InputIds.key, - "missing_input_id_key"), - tokenTensors) - .feed( - _tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .feed( - _tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.TokenTypeIds.key, "missing_segment_ids_key"), - segmentTensors) - .fetch(_tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) - - val outs = runner.run().asScala - val rawScores = TensorResources.extractFloats(outs.head) - - outs.foreach(_.close()) - tensors.clearSession(outs) - tensors.clearTensors() - - rawScores - } - - private def padArrayWithZeros(arr: Array[Int], maxLength: Int): Array[Int] = { - if (arr.length >= maxLength) { - 
arr
-    } else {
-      arr ++ Array.fill(maxLength - arr.length)(0)
-    }
-  }
+      activation: String): Array[Array[Float]] = ???
 
   def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = {
     val batchLength = batch.length
-    val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max
     val (startLogits, endLogits) = detectedEngine match {
       case ONNX.name => computeLogitsWithOnnx(batch)
-      case TensorFlow.name => computeLogitsWithTF(batch, maxSentenceLength)
+      case Openvino.name => computeLogitsWithOv(batch)
+      case _ => computeLogitsWithTF(batch)
     }
 
     val endDim = endLogits.length / batchLength
@@ -386,12 +321,14 @@ private[johnsnowlabs] class CamemBertClassification(
     (startScores, endScores)
   }
 
-  private def computeLogitsWithTF(
-      batch: Seq[Array[Int]],
-      maxSentenceLength: Int): (Array[Float], Array[Float]) = {
+  private def computeLogitsWithTF(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = {
     val tensors = new TensorResources()
-    val (tokenBuffers, maskBuffers) =
-      initializeTFLongTensorResources(batch, tensors, maxSentenceLength)
+
+    val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max
+    val batchLength = batch.length
+
+    val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength)
+    val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength)
 
     // [nb of encoded sentences , maxSentenceLength]
     val shape = Array(batch.length.toLong, maxSentenceLength)
@@ -439,32 +376,54 @@ private[johnsnowlabs] class CamemBertClassification(
     (startLogits, endLogits)
   }
 
-  private def initializeTFLongTensorResources(
-      batch: Seq[Array[Int]],
-      tensors: TensorResources,
-      maxSentenceLength: Int): (LongDataBuffer, LongDataBuffer) = {
-    val batchLength = batch.length
-    val dim = batchLength * maxSentenceLength
-    val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(dim)
-    val maskBuffers: LongDataBuffer = tensors.createLongBuffer(dim)
-    (tokenBuffers, maskBuffers)
-  }
-
-  private def initializeTFIntTensorResources(
-      batch: Seq[Array[Int]],
-      tensors: TensorResources,
-      maxSentenceLength: Int): (IntDataBuffer, IntDataBuffer, IntDataBuffer) = {
+  def computeLogitsWithOv(
+      batch: Seq[Array[Int]]
+  ): (Array[Float], Array[Float]) = {
+    val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max
     val batchLength = batch.length
-    val dim = batchLength * maxSentenceLength
-    val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(dim)
-    val maskBuffers: IntDataBuffer = tensors.createIntBuffer(dim)
-    val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(dim)
-    (tokenBuffers, maskBuffers, segmentBuffers)
-  }
+    val shape = Array(batchLength, maxSentenceLength)
+    val (tokenTensors, maskTensors) =
+      PrepareEmbeddings.prepareOvLongBatchTensors(batch, maxSentenceLength, batchLength)
+
+    val inferRequest = openvinoWrapper.get.getCompiledModel().create_infer_request()
+    inferRequest.set_tensor("input_ids", tokenTensors)
+    inferRequest.set_tensor("attention_mask", maskTensors)
+    inferRequest.infer()
+
+
+    try {
+      try {
+        val startLogits = inferRequest
+          .get_tensor("start_logits")
+          .data()
+        val endLogits = inferRequest
+          .get_tensor("end_logits")
+          .data()
+
+        (startLogits, endLogits)
+      }
+    } catch {
+      case e: Exception =>
+        // Log the exception as a warning
+        logger.warn("Exception in computeLogitsWithOv", e)
+        // Rethrow the exception to propagate it further
+        throw e
+    }
+
+  }
 
   private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = {
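+    // Mirrors computeLogitsWithOv for the ONNX path: build one padded Long tensor
+    // each for "input_ids" and "attention_mask", run the session once, and read
+    // back the QA head outputs (assumed to be exported under the usual
+    // "start_logits"/"end_logits" names, matching the OpenVINO export above).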
+ // [nb of encoded sentences] val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) - val (tokenTensors, maskTensors) = initializeOnnxTensorResources(batch, env) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava @@ -499,17 +458,6 @@ private[johnsnowlabs] class CamemBertClassification( } } - private def initializeOnnxTensorResources(batch: Seq[Array[Int]], env: OrtEnvironment) = { - val tokenTensors = - OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) - val maskTensors = - OnnxTensor.createTensor( - env, - batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) - - (tokenTensors, maskTensors) - } - def findIndexedToken( tokenizedSentences: Seq[TokenizedSentence], sentence: (WordpieceTokenizedSentence, Int), From 2b0a2dc3d93925a6dd984cf1e69ecf27eff3ecd7 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Mon, 2 Sep 2024 07:52:29 +0500 Subject: [PATCH 2/5] adding openvino support to CamembertForXXX --- .../dl/CamemBertForQuestionAnswering.scala | 56 ++++++++++++----- .../CamemBertForSequenceClassification.scala | 54 +++++++++++----- .../dl/CamemBertForTokenClassification.scala | 57 ++++++++++++----- ...amemBertForQuestionAnsweringTestSpec.scala | 62 ++++++++++++++++++- 4 files changed, 177 insertions(+), 52 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala index 4ba692bfc2a906..2b83788b630b87 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnswering.scala @@ -18,18 +18,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai._ import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -118,6 +111,7 @@ class CamemBertForQuestionAnswering(override val uid: String) with HasBatchedAnnotate[CamemBertForQuestionAnswering] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -200,6 +194,7 @@ class CamemBertForQuestionAnswering(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: 
Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForQuestionAnswering = { if (_model.isEmpty) { _model = Some( @@ -207,6 +202,7 @@ class CamemBertForQuestionAnswering(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = Map.empty[String, Int], @@ -274,7 +270,15 @@ class CamemBertForQuestionAnswering(override val uid: String) spark, getModelIfNotSet.onnxWrapper.get, suffix, - CamemBertForTokenClassification.onnxFile) + CamemBertForQuestionAnswering.onnxFile) + + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForQuestionAnswering.openvinoFile) } writeSentencePieceModel( @@ -312,11 +316,13 @@ trait ReadablePretrainedCamemBertForQAModel trait ReadCamemBertForQADLModel extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel { this: ParamsAndFeaturesReadable[CamemBertForQuestionAnswering] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" + override val openvinoFile: String = "camembert_classification_openvino" override val sppFile: String = "camembert_spp" def readTensorflow( @@ -330,7 +336,7 @@ trait ReadCamemBertForQADLModel case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf", initAllTables = false) - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -340,7 +346,11 @@ trait ReadCamemBertForQADLModel zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_ov") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) case _ => throw new Exception(notSupportedEngineError) } @@ -375,12 +385,24 @@ trait ReadCamemBertForQADLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) annotatorModel - .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + .setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) + case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala index d56b7528abefb5..73bf506d2dd8f4 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForSequenceClassification.scala @@ -18,19 +18,11 @@ package 
com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.CamemBertClassification import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -126,6 +118,7 @@ class CamemBertForSequenceClassification(override val uid: String) with HasBatchedAnnotate[CamemBertForSequenceClassification] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasClassifierActivationProperties @@ -243,6 +236,7 @@ class CamemBertForSequenceClassification(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForSequenceClassification = { if (_model.isEmpty) { _model = Some( @@ -250,6 +244,7 @@ class CamemBertForSequenceClassification(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -327,6 +322,13 @@ class CamemBertForSequenceClassification(override val uid: String) getModelIfNotSet.onnxWrapper.get, suffix, CamemBertForSequenceClassification.onnxFile) + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForSequenceClassification.openvinoFile) } writeSentencePieceModel( @@ -364,12 +366,14 @@ trait ReadablePretrainedCamemBertForSequenceModel trait ReadCamemBertForSequenceDLModel extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel{ this: ParamsAndFeaturesReadable[CamemBertForSequenceClassification] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" override val sppFile: String = "camembert_spp" + override val openvinoFile: String = "camembert_classification_openvino" def readModel( instance: CamemBertForSequenceClassification, @@ -382,7 +386,7 @@ trait ReadCamemBertForSequenceDLModel case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf", initAllTables = false) - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -392,7 +396,12 @@ trait ReadCamemBertForSequenceDLModel zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + 
instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_openvino") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) + case _ => throw new Exception(notSupportedEngineError) } @@ -430,12 +439,23 @@ trait ReadCamemBertForSequenceDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) annotatorModel - .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + .setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala index 5669945561dd79..10f965f8709cff 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForTokenClassification.scala @@ -18,19 +18,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.CamemBertClassification import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -125,6 +117,7 @@ class CamemBertForTokenClassification(override val uid: String) with HasBatchedAnnotate[CamemBertForTokenClassification] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -221,6 +214,7 @@ class CamemBertForTokenClassification(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForTokenClassification = { if (_model.isEmpty) { _model = Some( @@ -228,6 +222,7 @@ class CamemBertForTokenClassification(override val uid: String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = 
$$(labels), @@ -298,8 +293,17 @@ class CamemBertForTokenClassification(override val uid: String) getModelIfNotSet.onnxWrapper.get, suffix, CamemBertForTokenClassification.onnxFile) + + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForTokenClassification.openvinoFile) } + writeSentencePieceModel( path, spark, @@ -333,11 +337,13 @@ trait ReadablePretrainedCamemBertForTokenModel trait ReadCamemBertForTokenDLModel extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel { this: ParamsAndFeaturesReadable[CamemBertForTokenClassification] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" + override val openvinoFile: String = "camembert_classification_openvino" override val sppFile: String = "camembert_spp" def readModel( @@ -351,7 +357,7 @@ trait ReadCamemBertForTokenDLModel case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf", initAllTables = false) - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -361,7 +367,12 @@ trait ReadCamemBertForTokenDLModel zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_openvino") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) + case _ => throw new Exception(notSupportedEngineError) } @@ -397,12 +408,24 @@ trait ReadCamemBertForTokenDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) annotatorModel - .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + .setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) + case _ => throw new Exception(notSupportedEngineError) } diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala index 4a35c6c4cb0cea..4aa0249ac08d4b 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForQuestionAnsweringTestSpec.scala @@ -20,7 +20,7 @@ import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.util.io.ResourceHelper import com.johnsnowlabs.tags.SlowTest import com.johnsnowlabs.util.Benchmark -import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.{Pipeline, PipelineModel} import org.scalatest.flatspec.AnyFlatSpec class CamemBertForQuestionAnsweringTestSpec extends AnyFlatSpec { @@ -80,6 +80,66 @@ class 
CamemBertForQuestionAnsweringTestSpec extends AnyFlatSpec { } + + "CamemBertForQuestionAnswering" should "be saved and loaded correctly" taggedAs SlowTest in { + + import ResourceHelper.spark.implicits._ + + val beyonceContext = + """BeyoncΓ© Giselle Knowles-Carter (/biːˈjΙ’nseΙͺ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of BeyoncΓ©'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".""" + val amazonContext = + """The Amazon rainforest (Portuguese: Floresta AmazΓ΄nica or AmazΓ΄nia; Spanish: Selva AmazΓ³nica, AmazonΓ­a or usually Amazonia; French: ForΓͺt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain "Amazonas" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.""" + + val ddd = Seq( + ( + "Where was John Lenon born?", + "John Lenon was born in London and lived in Paris. 
My name is Sarah and I live in London."), + ("What's my name?", "My name is Clara and I live in Berkeley."), + ("Which name is also used to describe the Amazon rainforest in English?", amazonContext), + ("When did Beyonce start becoming popular?", beyonceContext), + ("What areas did Beyonce compete in when she was growing up?", beyonceContext), + ("When did Beyonce leave Destiny's Child and become a solo singer?", beyonceContext), + ("What was the first album BeyoncΓ© released as a solo artist?", beyonceContext)) + .toDF("question", "context") + .repartition(1) + + val document = new MultiDocumentAssembler() + .setInputCols("question", "context") + .setOutputCols("document_question", "document_context") + + val questionAnswering = CamemBertForQuestionAnswering + .pretrained() + .setInputCols(Array("document_question", "document_context")) + .setOutputCol("answer") + .setCaseSensitive(false) + .setMaxSentenceLength(512) + + + val pipeline = new Pipeline().setStages(Array(document, questionAnswering)) + + val pipelineModel = pipeline.fit(ddd) + val pipelineDF = pipelineModel.transform(ddd) + + pipelineDF.select("answer.result").show(false) + + Benchmark.time("Time to save CamemBertForQuestionAnswering pipeline model") { + pipelineModel.write.overwrite().save("./tmp_forquestionanswering_pipeline") + } + + Benchmark.time("Time to save CamemBertForQuestionAnswering model") { + pipelineModel.stages.last + .asInstanceOf[CamemBertForQuestionAnswering] + .write + .overwrite() + .save("./tmp_forquestionanswering_model") + } + + val loadedPipelineModel = PipelineModel.load("./tmp_forquestionanswering_pipeline") + loadedPipelineModel.transform(ddd).select("answer.result").show(false) + + val loadedSequenceModel = CamemBertForQuestionAnswering.load("./tmp_forquestionanswering_model") + + } "CamemBertForQuestionAnswering" should "benchmark test" taggedAs SlowTest in { val data = ResourceHelper.spark.read From 85931facfd3909ea1b95b97ab5e29261927ba9ba Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Mon, 2 Sep 2024 23:02:15 +0500 Subject: [PATCH 3/5] Update CamemBertClassification.scala --- .../ml/ai/CamemBertClassification.scala | 399 ++++++++++++------ 1 file changed, 264 insertions(+), 135 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala index a9929a2ce95352..56002366214529 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala @@ -16,7 +16,7 @@ package com.johnsnowlabs.ml.ai -import ai.onnxruntime.OnnxTensor +import ai.onnxruntime.{OnnxTensor, OrtEnvironment} import com.johnsnowlabs.ml.ai.util.PrepareEmbeddings import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} import com.johnsnowlabs.ml.openvino.OpenvinoWrapper @@ -25,34 +25,34 @@ import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignat import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ +import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} -import org.intel.openvino.Tensor import org.tensorflow.ndarray.buffer.{IntDataBuffer, LongDataBuffer} import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ /** @param tensorflowWrapper - * CamemBERT Model wrapper with TensorFlow 
Wrapper - * @param spp - * XlmRoberta SentencePiece model with SentencePieceWrapper - * @param configProtoBytes - * Configuration for TensorFlow session - * @param tags - * labels which model was trained with in order - * @param signatures - * TF v2 signatures in Spark NLP - */ + * CamemBERT Model wrapper with TensorFlow Wrapper + * @param spp + * XlmRoberta SentencePiece model with SentencePieceWrapper + * @param configProtoBytes + * Configuration for TensorFlow session + * @param tags + * labels which model was trained with in order + * @param signatures + * TF v2 signatures in Spark NLP + */ private[johnsnowlabs] class CamemBertClassification( - val tensorflowWrapper: Option[TensorflowWrapper], - val onnxWrapper: Option[OnnxWrapper], - val openvinoWrapper: Option[OpenvinoWrapper], - val spp: SentencePieceWrapper, - configProtoBytes: Option[Array[Byte]] = None, - tags: Map[String, Int], - signatures: Option[Map[String, String]] = None, - threshold: Float = 0.5f) - extends Serializable + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], + val openvinoWrapper: Option[OpenvinoWrapper], + val spp: SentencePieceWrapper, + configProtoBytes: Option[Array[Byte]] = None, + tags: Map[String, Int], + signatures: Option[Map[String, String]] = None, + threshold: Float = 0.5f) + extends Serializable with XXXForClassification { protected val logger: Logger = LoggerFactory.getLogger("CamemBertClassification") @@ -66,9 +66,9 @@ private[johnsnowlabs] class CamemBertClassification( private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions /** HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated - * in the actual # sentencepiece vocabulary (this is the case for '''''' and '''''') - * '''NOTUSED": 0''','''"": 1''', '''"NOTUSED": 2''', '''"": 3''' - */ + * in the actual # sentencepiece vocabulary (this is the case for '''''' and '''''') + * '''NOTUSED": 0''','''"": 1''', '''"NOTUSED": 2''', '''"": 3''' + */ private val pieceIdOffset: Int = 4 protected val sentenceStartTokenId: Int = spp.getSppModel.pieceToId("") + pieceIdOffset protected val sentenceEndTokenId: Int = spp.getSppModel.pieceToId("") + pieceIdOffset @@ -79,9 +79,9 @@ private[johnsnowlabs] class CamemBertClassification( protected val sigmoidThreshold: Float = threshold def tokenizeWithAlignment( - sentences: Seq[TokenizedSentence], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + sentences: Seq[TokenizedSentence], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val encoder = new SentencepieceEncoder( @@ -100,14 +100,26 @@ private[johnsnowlabs] class CamemBertClassification( } def tokenizeSeqString( - candidateLabels: Seq[String], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = ??? 
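+      // Each candidate label is wrapped in a Sentence, split with BasicTokenizer,
+      // then piece-encoded below; note the local pieceIdOffset of 1 used for label
+      // encoding, as opposed to the class-level offset of 4.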
+ candidateLabels: Seq[String], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + val basicTokenizer = new BasicTokenizer(caseSensitive) + val encoder = + new SentencepieceEncoder(spp, caseSensitive, sentencePieceDelimiterId, pieceIdOffset = 1) + + val labelsToSentences = candidateLabels.map { s => Sentence(s, 0, s.length - 1, 0) } + + labelsToSentences.map(label => { + val tokens = basicTokenizer.tokenize(label) + val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)).take(maxSeqLength) + WordpieceTokenizedSentence(wordpieceTokens) + }) + } def tokenizeDocument( - docs: Seq[Annotation], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + docs: Seq[Annotation], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val encoder = new SentencepieceEncoder( @@ -148,31 +160,33 @@ private[johnsnowlabs] class CamemBertClassification( private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { val tensors = new TensorResources() + // val (tokenBuffers, maskBuffers) = initializeTFLongTensorResources(batch, tensors, maxSentenceLength) val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length - val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) + val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength) + val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength) // [nb of encoded sentences , maxSentenceLength] val shape = Array(batch.length.toLong, maxSentenceLength) batch.zipWithIndex .foreach { case (sentence, idx) => + val sentenceLong = sentence.map(x => x.toLong) val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentence) + tokenBuffers.offset(offset).write(sentenceLong) maskBuffers .offset(offset) - .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) + .write(sentence.map(x => if (x == sentencePadTokenId) 0L else 1L)) } val runner = tensorflowWrapper.get .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) .runner - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) runner .feed( @@ -196,6 +210,37 @@ private[johnsnowlabs] class CamemBertClassification( rawScores } + private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + val (tokenTensors, maskTensors) = initializeOnnxTensorResources(batch, env) + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + embeddings + } finally if (results != null) results.close() + } catch { + case e: Exception => + // Handle exceptions by logging or other means. + e.printStackTrace() + Array.empty[Float] // Return an empty array or appropriate error handling + } finally { + // Close tensors outside the try-catch to avoid repeated null checks. 
+ // These resources are initialized before the try-catch, so they should be closed here. + tokenTensors.close() + maskTensors.close() + } + } + private def getRawScoresWithOv( batch: Seq[Array[Int]], maxSentenceLength: Int @@ -230,46 +275,6 @@ private[johnsnowlabs] class CamemBertClassification( } - private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { - - // [nb of encoded sentences , maxSentenceLength] - val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) - - val tokenTensors = - OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) - val maskTensors = - OnnxTensor.createTensor( - env, - batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) - - val inputs = - Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava - - try { - val results = runner.run(inputs) - try { - val embeddings = results - .get("logits") - .get() - .asInstanceOf[OnnxTensor] - .getFloatBuffer - .array() - - embeddings - } finally if (results != null) results.close() - } catch { - case e: Exception => - // Handle exceptions by logging or other means. - e.printStackTrace() - Array.empty[Float] // Return an empty array or appropriate error handling - } finally { - // Close tensors outside the try-catch to avoid repeated null checks. - // These resources are initialized before the try-catch, so they should be closed here. - tokenTensors.close() - maskTensors.close() - } - } - def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max @@ -278,7 +283,7 @@ private[johnsnowlabs] class CamemBertClassification( val rawScores = detectedEngine match { case ONNX.name => getRawScoresWithOnnx(batch) case Openvino.name => getRawScoresWithOv(batch, maxSentenceLength) - case TensorFlow.name => getRawScoresWithTF(batch, maxSentenceLength) + case _ => getRawScoresWithTF(batch, maxSentenceLength) } val dim = rawScores.length / batchLength @@ -297,89 +302,114 @@ private[johnsnowlabs] class CamemBertClassification( } def tagZeroShotSequence( - batch: Seq[Array[Int]], - entailmentId: Int, - contradictionId: Int, - activation: String): Array[Array[Float]] = ??? 
- - def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { - val batchLength = batch.length - val (startLogits, endLogits) = detectedEngine match { - case ONNX.name => computeLogitsWithOnnx(batch) - case Openvino.name => computeLogitsWithOv(batch) - case _ => computeLogitsWithTF(batch) - } + batch: Seq[Array[Int]], + entailmentId: Int, + contradictionId: Int, + activation: String): Array[Array[Float]] = { - val endDim = endLogits.length / batchLength - val endScores: Array[Array[Float]] = - endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val paddedBatch = batch.map(arr => padArrayWithZeros(arr, maxSentenceLength)) + val batchLength = paddedBatch.length - val startDim = startLogits.length / batchLength - val startScores: Array[Array[Float]] = - startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + val rawScores = detectedEngine match { + case TensorFlow.name => computeZeroShotLogitsWithTF(paddedBatch, maxSentenceLength) + case ONNX.name => computeZeroShotLogitsWithONNX(paddedBatch) + } - (startScores, endScores) + val dim = rawScores.length / batchLength + rawScores + .grouped(dim) + .toArray } - private def computeLogitsWithTF(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { - val tensors = new TensorResources() + def computeZeroShotLogitsWithONNX(batch: Seq[Array[Int]]): Array[Float] = { + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + val (tokenTensors, maskTensors) = initializeOnnxTensorResources(batch, env) + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max - val batchLength = batch.length + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() - val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength) - val maskBuffers: LongDataBuffer = tensors.createLongBuffer(batchLength * maxSentenceLength) + embeddings + } finally if (results != null) results.close() + } + } + def computeZeroShotLogitsWithTF( + batch: Seq[Array[Int]], + maxSentenceLength: Int): Array[Float] = { + val tensors = new TensorResources() + val (tokenBuffers, maskBuffers, segmentBuffers) = + initializeTFIntTensorResources(batch, tensors, maxSentenceLength) // [nb of encoded sentences , maxSentenceLength] val shape = Array(batch.length.toLong, maxSentenceLength) - // [nb of encoded sentences , maxSentenceLength] batch.zipWithIndex .foreach { case (sentence, idx) => - val sentenceLong = sentence.map(x => x.toLong) val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentenceLong) + tokenBuffers.offset(offset).write(sentence) maskBuffers .offset(offset) - .write(sentence.map(x => if (x == sentencePadTokenId) 0L else 1L)) + .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) + segmentBuffers.offset(offset).write(Array.fill(maxSentenceLength)(0)) } val runner = tensorflowWrapper.get .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) .runner - val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) + val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) + val 
maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) + val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) runner .feed( - _tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), + _tfCamemBertSignatures.getOrElse( + ModelSignatureConstants.InputIds.key, + "missing_input_id_key"), tokenTensors) .feed( _tfCamemBertSignatures .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), maskTensors) + .feed( + _tfCamemBertSignatures + .getOrElse(ModelSignatureConstants.TokenTypeIds.key, "missing_segment_ids_key"), + segmentTensors) .fetch(_tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.EndLogitsOutput.key, "missing_end_logits_key")) - .fetch(_tfCamemBertSignatures - .getOrElse(ModelSignatureConstants.StartLogitsOutput.key, "missing_start_logits_key")) + .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) val outs = runner.run().asScala - val endLogits = TensorResources.extractFloats(outs.head) - val startLogits = TensorResources.extractFloats(outs.last) + val rawScores = TensorResources.extractFloats(outs.head) outs.foreach(_.close()) tensors.clearSession(outs) tensors.clearTensors() - (startLogits, endLogits) + rawScores } + private def padArrayWithZeros(arr: Array[Int], maxLength: Int): Array[Int] = { + if (arr.length >= maxLength) { + arr + } else { + arr ++ Array.fill(maxLength - arr.length)(0) + } + } def computeLogitsWithOv( - batch: Seq[Array[Int]] - ): (Array[Float], Array[Float]) = { + batch: Seq[Array[Int]] + ): (Array[Float], Array[Float]) = { val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length val shape = Array(batchLength, maxSentenceLength) @@ -392,7 +422,6 @@ private[johnsnowlabs] class CamemBertClassification( inferRequest.infer() - try { try { val startLogits = inferRequest @@ -413,17 +442,106 @@ private[johnsnowlabs] class CamemBertClassification( } } - private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { - // [nb of encoded sentences] - val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) - val tokenTensors = - OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) - val maskTensors = - OnnxTensor.createTensor( - env, - batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { + val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val (startLogits, endLogits) = detectedEngine match { + case ONNX.name => computeLogitsWithOnnx(batch) + case Openvino.name => computeLogitsWithOv(batch) + case TensorFlow.name => computeLogitsWithTF(batch, maxSentenceLength) + } + + val endDim = endLogits.length / batchLength + val endScores: Array[Array[Float]] = + endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + val startDim = startLogits.length / batchLength + val startScores: Array[Array[Float]] = + startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + + (startScores, endScores) + } + + private def computeLogitsWithTF( + batch: Seq[Array[Int]], + maxSentenceLength: Int): (Array[Float], Array[Float]) = { + val tensors = new TensorResources() + val (tokenBuffers, maskBuffers) = + initializeTFLongTensorResources(batch, tensors, maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + val shape = 
Array(batch.length.toLong, maxSentenceLength) + + // [nb of encoded sentences , maxSentenceLength] + batch.zipWithIndex + .foreach { case (sentence, idx) => + val sentenceLong = sentence.map(x => x.toLong) + val offset = idx * maxSentenceLength + tokenBuffers.offset(offset).write(sentenceLong) + maskBuffers + .offset(offset) + .write(sentence.map(x => if (x == sentencePadTokenId) 0L else 1L)) + } + + val runner = tensorflowWrapper.get + .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) + .runner + + val tokenTensors = tensors.createLongBufferTensor(shape, tokenBuffers) + val maskTensors = tensors.createLongBufferTensor(shape, maskBuffers) + + runner + .feed( + _tfCamemBertSignatures + .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), + tokenTensors) + .feed( + _tfCamemBertSignatures + .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), + maskTensors) + .fetch(_tfCamemBertSignatures + .getOrElse(ModelSignatureConstants.EndLogitsOutput.key, "missing_end_logits_key")) + .fetch(_tfCamemBertSignatures + .getOrElse(ModelSignatureConstants.StartLogitsOutput.key, "missing_start_logits_key")) + + val outs = runner.run().asScala + val endLogits = TensorResources.extractFloats(outs.head) + val startLogits = TensorResources.extractFloats(outs.last) + + outs.foreach(_.close()) + tensors.clearSession(outs) + tensors.clearTensors() + + (startLogits, endLogits) + } + + private def initializeTFLongTensorResources( + batch: Seq[Array[Int]], + tensors: TensorResources, + maxSentenceLength: Int): (LongDataBuffer, LongDataBuffer) = { + val batchLength = batch.length + val dim = batchLength * maxSentenceLength + val tokenBuffers: LongDataBuffer = tensors.createLongBuffer(dim) + val maskBuffers: LongDataBuffer = tensors.createLongBuffer(dim) + (tokenBuffers, maskBuffers) + } + + private def initializeTFIntTensorResources( + batch: Seq[Array[Int]], + tensors: TensorResources, + maxSentenceLength: Int): (IntDataBuffer, IntDataBuffer, IntDataBuffer) = { + val batchLength = batch.length + val dim = batchLength * maxSentenceLength + val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(dim) + val maskBuffers: IntDataBuffer = tensors.createIntBuffer(dim) + val segmentBuffers: IntDataBuffer = tensors.createIntBuffer(dim) + (tokenBuffers, maskBuffers, segmentBuffers) + } + + private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + val (tokenTensors, maskTensors) = initializeOnnxTensorResources(batch, env) val inputs = Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava @@ -458,12 +576,23 @@ private[johnsnowlabs] class CamemBertClassification( } } + private def initializeOnnxTensorResources(batch: Seq[Array[Int]], env: OrtEnvironment) = { + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + (tokenTensors, maskTensors) + } + def findIndexedToken( - tokenizedSentences: Seq[TokenizedSentence], - sentence: (WordpieceTokenizedSentence, Int), - tokenPiece: TokenPiece): Option[IndexedToken] = { + tokenizedSentences: Seq[TokenizedSentence], + sentence: (WordpieceTokenizedSentence, Int), + tokenPiece: TokenPiece): Option[IndexedToken] = { tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin && 
tokenPiece.isWordStart) } -} +} \ No newline at end of file From 10f1fe3d615a7bbf26430e6ef759368d9e06b3a6 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Mon, 9 Sep 2024 13:42:02 +0500 Subject: [PATCH 4/5] fixed syntax --- .../com/johnsnowlabs/ml/ai/CamemBertClassification.scala | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala index 86a68aa208286f..fdc5919f1ef88d 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala @@ -350,10 +350,6 @@ private[johnsnowlabs] class CamemBertClassification( batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { - - batch: Seq[Array[Int]], - maxSentenceLength: Int): Array[Float] = { - val tensors = new TensorResources() val (tokenBuffers, maskBuffers, segmentBuffers) = initializeTFIntTensorResources(batch, tensors, maxSentenceLength) @@ -607,5 +603,4 @@ private[johnsnowlabs] class CamemBertClassification( tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin && tokenPiece.isWordStart) } - } \ No newline at end of file From 35d5d80558b9370a56736bfc514746eaaed6ba62 Mon Sep 17 00:00:00 2001 From: ahmedlone127 Date: Tue, 10 Sep 2024 09:12:56 +0500 Subject: [PATCH 5/5] Update CamemBertForZeroShotClassification.scala --- .../CamemBertForZeroShotClassification.scala | 46 ++++++++++++++++--- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala index 4a5bcde0e87ef1..f66b297d7fc132 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/CamemBertForZeroShotClassification.scala @@ -18,6 +18,7 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.CamemBertClassification import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} +import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel} import com.johnsnowlabs.ml.tensorflow.{ ReadTensorflowModel, TensorflowWrapper, @@ -34,7 +35,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{ modelSanityCheck, notSupportedEngineError } -import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} +import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow} import com.johnsnowlabs.nlp.annotators.common.{SentenceSplit, TokenizedWithSentence} import com.johnsnowlabs.nlp.serialization.MapFeature import com.johnsnowlabs.nlp.{ @@ -59,6 +60,7 @@ class CamemBertForZeroShotClassification(override val uid: String) with HasBatchedAnnotate[CamemBertForZeroShotClassification] with WriteTensorflowModel with WriteOnnxModel + with WriteOpenvinoModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasClassifierActivationProperties @@ -178,6 +180,7 @@ class CamemBertForZeroShotClassification(override val uid: String) spark: SparkSession, tensorflowWrapper: Option[TensorflowWrapper], onnxWrapper: Option[OnnxWrapper], + openvinoWrapper: Option[OpenvinoWrapper], spp: SentencePieceWrapper): CamemBertForZeroShotClassification = { if (_model.isEmpty) { _model = Some( @@ -185,6 +188,7 @@ class CamemBertForZeroShotClassification(override val uid: 
String) new CamemBertClassification( tensorflowWrapper, onnxWrapper, + openvinoWrapper, spp, configProtoBytes = None, tags = $$(labels), @@ -269,6 +273,15 @@ class CamemBertForZeroShotClassification(override val uid: String) getModelIfNotSet.onnxWrapper.get, suffix, CamemBertForSequenceClassification.onnxFile) + + case Openvino.name => + writeOpenvinoModel( + path, + spark, + getModelIfNotSet.openvinoWrapper.get, + "openvino_model.xml", + CamemBertForSequenceClassification.openvinoFile) + } writeSentencePieceModel( @@ -305,11 +318,13 @@ trait ReadPretrainedCamemBertForZeroShotClassification trait ReadCamemBertForZeroShotClassification extends ReadTensorflowModel with ReadOnnxModel - with ReadSentencePieceModel { + with ReadSentencePieceModel + with ReadOpenvinoModel { this: ParamsAndFeaturesReadable[CamemBertForZeroShotClassification] => override val tfFile: String = "camembert_classification_tensorflow" override val onnxFile: String = "camembert_classification_onnx" + override val openvinoFile: String = "camembert_classification_openvino" override val sppFile: String = "camembert_spp" def readModel( @@ -322,7 +337,7 @@ trait ReadCamemBertForZeroShotClassification instance.getEngine match { case TensorFlow.name => val tfWrapper = readTensorflowModel(path, spark, "_camembert_classification_tf") - instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, None, spp) case ONNX.name => val onnxWrapper = readOnnxModel( @@ -332,11 +347,16 @@ trait ReadCamemBertForZeroShotClassification zipped = true, useBundle = false, None) - instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spp) + + case Openvino.name => + val openvinoWrapper = readOpenvinoModel(path, spark, "_camembert_classification_ov") + instance.setModelIfNotSet(spark, None, None, Some(openvinoWrapper), spp) + case _ => throw new Exception(notSupportedEngineError) - } + } } addReader(readModel) @@ -392,11 +412,23 @@ trait ReadCamemBertForZeroShotClassification */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, Some(wrapper), None, spModel) + .setModelIfNotSet(spark, Some(wrapper), None, None, spModel) case ONNX.name => val onnxWrapper = OnnxWrapper.read(spark, localModelPath, zipped = false, useBundle = true) - annotatorModel.setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) + annotatorModel.setModelIfNotSet(spark, None, Some(onnxWrapper), None, spModel) + + case Openvino.name => + val ovWrapper: OpenvinoWrapper = + OpenvinoWrapper.read( + spark, + localModelPath, + zipped = false, + useBundle = true, + detectedEngine = detectedEngine) + annotatorModel + .setModelIfNotSet(spark, None, None, Some(ovWrapper), spModel) + case _ => throw new Exception(notSupportedEngineError) }
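As a usage reference, the sketch below shows how the new engine path would be exercised end to end, assuming a CamemBERT question-answering model exported to OpenVINO by the accompanying notebooks. The local folder path and the ambient SparkSession named spark are illustrative placeholders, not part of this change:

    import com.johnsnowlabs.nlp.base.MultiDocumentAssembler
    import com.johnsnowlabs.nlp.annotators.classifier.dl.CamemBertForQuestionAnswering
    import org.apache.spark.ml.Pipeline

    // Assemble question/context pairs exactly as in CamemBertForQuestionAnsweringTestSpec.
    val document = new MultiDocumentAssembler()
      .setInputCols("question", "context")
      .setOutputCols("document_question", "document_context")

    // loadSavedModel resolves detectedEngine from the artifacts in the folder, so the
    // same call covers TensorFlow, ONNX, and the new OpenVINO exports.
    // "/tmp/camembert_qa_openvino" is a hypothetical export location.
    val questionAnswering = CamemBertForQuestionAnswering
      .loadSavedModel("/tmp/camembert_qa_openvino", spark)
      .setInputCols(Array("document_question", "document_context"))
      .setOutputCol("answer")
      .setCaseSensitive(false)
      .setMaxSentenceLength(512)

    val pipeline = new Pipeline().setStages(Array(document, questionAnswering))

A pipeline fitted this way can be saved and reloaded as in the test spec above; the OpenVINO weights round-trip through writeOpenvinoModel and readOpenvinoModel under the camembert_classification_openvino suffix.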