diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb new file mode 100644 index 00000000000000..5f3a6e2d16d03d --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb @@ -0,0 +1,2433 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "_opj2ZzntbDk" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForQuestionAnswering.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "u1i6TpsutbDl" + }, + "source": [ + "## Import ONNX XlmRoBertaForQuestionAnswering models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `XlmRoBertaForQuestionAnswering` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import XLM-RoBERTa models trained/fine-tuned for question answering via `XlmRoBertaForQuestionAnswering` or `TFXlmRoBertaForQuestionAnswering`. These models are usually under `Question Answering` category and have `xlm-roberta` in their labels\n", + "- Reference: [TFXlmRoBertaForQuestionAnswering](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForQuestionAnswering)\n", + "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=question-answering)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tikYI59NtbDl" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fHfLHo2CtbDl" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully.\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "BtaSwj8mtbDl", + "outputId": "42f0e775-573e-4260-a696-48a25bedc212", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m43.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m63.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m43.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m70.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m69.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m84.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m87.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m37.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m87.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m51.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m93.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m77.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m81.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m42.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m76.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m58.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m83.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m40.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m68.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m94.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m68.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m87.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m74.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m85.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m36.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m75.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m51.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m84.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m82.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m74.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m56.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m90.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m32.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m89.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m40.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m38.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m79.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m29.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m68.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m75.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oOUudG_-tbDm" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use ['deepset/xlm-roberta-base-squad2'](https://huggingface.co/'deepset/xlm-roberta-base-squad2') model from HuggingFace as an example as an example and load it as a `ORTModelForQuestionAnswering`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "0MbeB8jutbDm", + "outputId": "f0f57344-6fc4-4d0b-a019-f05ae56b368c", + "colab": { + "referenced_widgets": [ + "b8d926231122407f95b4483350bc4e8e", + "4154781b221948aab4258b4fa6799996", + "2d9727f78a41430890ec77cce4fe0ce5", + "17ca1b81af0f408e9ab164456872cd49", + "6885319026334ff99533e70c8670baea", + "6566341fd0c04880a7ab5ff1409e4448", + "4119ae23b5bc4a0c9148e60eb9dfcb53", + "88166d6e4ee04f11a79c2a1a532c7300", + "130c539288884b3aa341f9be6c62d29a", + "032d809a9bb64c9b863f6f4b7b115133", + "c9bb2353da02443c94afd06685b8cde8", + "7f808235cbfd4ec28e05215cbd27e3f8", + "10e1aa374ebd43a9b083d9b7aab95b23", + "a5844989c8f648df82ee6414eeec9a21", + "d02f4f39378b4e568f0f9382efd6a9d1", + "76f48ff667754dd682ade56eaaca7049", + "f251ac2cd1ed45d4a7d25d6b906a84bc", + "824bc6d195ba465497fe34898880a7f8", + "05d85f70379b486db128698df001384e", + "7c620665c5964164adaf3539a0cb5ff5", + "f708d4e670c54cfe93d9ba82818590dc", + "0be61c031fad4f31877ad29119b6a77e", + "54210209428948189c549c1d2cd939ca", + "e77456b2ef2a46ef99572a02e1b70817", + "3fbcd89672f842bf9692ebffd72d26c7", + "f87349ba371f41d788f802a64f9312f9", + "c217c65b12264609862aeaeb1c25adc0", + "2bef4b3dc3cd457fa4ccb2e796d771f7", + "240bdca5cc794ae8b1319310e42aec2c", + "389d3448663f4389afa71734e6cc1434", + "937847a9252247979ed76e17dc60f7b6", + "cb0069caad574477866382fa085508d5", + "1f3d6a7de82f4710a79d814b6af57679", + "5473ab4a957b414aa344f37f921d89a2", + "423cf810d1254518af9e4fc79f97e54e", + "83c2346cea5c4928a23f4fa38d2ffe40", + "e2fde33a78ff4172b806b1530bda4e9a", + "d1b3fb0032ce400c80922d128940333c", + "96b57a70103f4ac9b014b375e1aee824", + "74b28f16de69498b9915e501863b4d73", + "8338b89888584db5944d0ad5baa9118f", + "8baaa8a868ec4e0b98388e565d61a3db", + "9c1d4b12b247412295fe57637f7d2e60", + "ac6ad6631b054107b3287180cf3b9e68", + "8c36c9eda216474d86fe73c161690a6d", + "14d951a930d5478b9f4e77c430c2bb6f", + "f6ae4d642a9f41cd829dde04347e50a3", + "969548659ca74527afa6a0f92e3a98e7", + "7f90a349f8ce4525aa33649d70c09a33", + "80fb8200d6d74cbca19ce946d73d9efa", + "67688fa958004ccfbacf0288f4075dea", + "fa8d4ffdee2d454cb14941089ff881d6", + "2577bab4d4a64010addd7c2b5370a6c8", + "01f9a013ab38435191872a8bce64dd69", + "0d91f57bfa25489eb70a23f2bd834cdf" + ], + "base_uri": "https://localhost:8080/", + "height": 405 + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/605 [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForQuestionAnswering\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'deepset/xlm-roberta-base-squad2'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForQuestionAnswering.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CKGIfc8ltbDn" + }, + "source": [ + "Let's have a look inside this directory and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "eoyOGvXftbDn", + "outputId": "3d061001-3161-4594-f681-e88211f4e796", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 1105736\n", + "-rw-r--r-- 1 root root 787 Jan 9 19:44 config.json\n", + "-rw-r--r-- 1 root root 1110100056 Jan 9 19:44 model.onnx\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:44 sentencepiece.bpe.model\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:44 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 500 Jan 9 19:44 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:44 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "blR5qjXwtbDn" + }, + "source": [ + "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "QEOomAeKtbDn" + }, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "5wjZ8w19tbDn" + }, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ehhwZp5ntbDn" + }, + "source": [ + "Voila! We have our `sentencepiece.bpe.model` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "s7B5nkQ7tbDn", + "outputId": "d5a9f508-f04c-4281-b99e-a74ce6c8c153", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "onnx_models/deepset/xlm-roberta-base-squad2:\n", + "total 1100788\n", + "drwxr-xr-x 2 root root 4096 Jan 9 19:44 assets\n", + "-rw-r--r-- 1 root root 787 Jan 9 19:44 config.json\n", + "-rw-r--r-- 1 root root 1110100056 Jan 9 19:44 model.onnx\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:44 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 500 Jan 9 19:44 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:44 tokenizer.json\n", + "\n", + "onnx_models/deepset/xlm-roberta-base-squad2/assets:\n", + "total 4952\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:44 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bVKi9X6ftbDn" + }, + "source": [ + "## Import and Save RoBertaForQuestionAnswering in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "be7jTIVAtbDo" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "R_YAIBS_tbDo", + "outputId": "7506fbe6-aa72-4697-ae19-5ab7ae404f18" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m40.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m23.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5MD6ogjatbDo" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kcTBCppJtbDo", + "outputId": "379c7e82-9918-4294-b20b-d0c45215febf" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "k3S-0O9btbDo" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `RoBertaForQuestionAnswering` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `RoBertaForQuestionAnswering` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Gsnk6JQ7tbDo" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "spanClassifier = RoBertaForQuestionAnswering.loadSavedModel(\n", + " ONNX_MODEL,\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(512)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3ed2WScitbDo" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Gy7YzF0htbDo" + }, + "outputs": [], + "source": [ + "spanClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1p0HFM4atbDo" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RvkyiLHotbDo" + }, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xiNxN0tdtbDo" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your RoBertaForQuestionAnswering model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Kzym6Y90tbDo", + "outputId": "b3f2deb2-be48-4eac-e747-472ec58d6873" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 484956\n", + "drwxr-xr-x 4 root root 4096 Oct 17 16:49 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 17 16:49 metadata\n", + "-rw-r--r-- 1 root root 496583922 Oct 17 16:49 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m2NiO3hytbDo" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForQuestionAnswering model in Spark NLP πŸš€ pipeline!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BmIIrsGctbDp", + "outputId": "44f84743-6908-4143-87b6-244aae258115" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------+\n", + "|result |\n", + "+---------------------------+\n", + "|[as Amazonia or the Amazon]|\n", + "+---------------------------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = MultiDocumentAssembler() \\\n", + " .setInputCols([\"question\", \"context\"]) \\\n", + " .setOutputCols([\"document_question\", \"document_context\"])\n", + "\n", + "spanClassifier_loaded = RoBertaForQuestionAnswering.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document_question\",'document_context'])\\\n", + " .setOutputCol(\"answer\")\n", + "\n", + "pipeline = Pipeline().setStages([\n", + " document_assembler,\n", + " spanClassifier_loaded\n", + "])\n", + "\n", + "context = \"\"\"The Amazon rainforest (Portuguese: Floresta AmazΓ΄nica or AmazΓ΄nia; Spanish: Selva AmazΓ³nica, AmazonΓ­a or usually Amazonia; French: ForΓͺt amazonienne; Dutch: Amazoneregenwoud), also known in English as Amazonia or the Amazon Jungle, is a moist broadleaf forest that covers most of the Amazon basin of South America. This basin encompasses 7,000,000 square kilometres (2,700,000 sq mi), of which 5,500,000 square kilometres (2,100,000 sq mi) are covered by the rainforest. This region includes territory belonging to nine nations. The majority of the forest is contained within Brazil, with 60% of the rainforest, followed by Peru with 13%, Colombia with 10%, and with minor amounts in Venezuela, Ecuador, Bolivia, Guyana, Suriname and French Guiana. States or departments in four nations contain \"Amazonas\" in their names. The Amazon represents over half of the planet's remaining rainforests, and comprises the largest and most biodiverse tract of tropical rainforest in the world, with an estimated 390 billion individual trees divided into 16,000 species.\"\"\"\n", + "question = \"Which name is also used to describe the Amazon rainforest in English?\"\n", + "example = spark.createDataFrame([[question, context]]).toDF(\"question\", \"context\")\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "result.select(\"answer.result\").show(1, False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M5L0cHZptbDp" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `RoBertaForQuestionAnswering` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "b8d926231122407f95b4483350bc4e8e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_4154781b221948aab4258b4fa6799996", + "IPY_MODEL_2d9727f78a41430890ec77cce4fe0ce5", + "IPY_MODEL_17ca1b81af0f408e9ab164456872cd49" + ], + "layout": "IPY_MODEL_6885319026334ff99533e70c8670baea" + } + }, + "4154781b221948aab4258b4fa6799996": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6566341fd0c04880a7ab5ff1409e4448", + "placeholder": "​", + "style": "IPY_MODEL_4119ae23b5bc4a0c9148e60eb9dfcb53", + "value": "config.json: 100%" + } + }, + "2d9727f78a41430890ec77cce4fe0ce5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_88166d6e4ee04f11a79c2a1a532c7300", + "max": 605, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_130c539288884b3aa341f9be6c62d29a", + "value": 605 + } + }, + "17ca1b81af0f408e9ab164456872cd49": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_032d809a9bb64c9b863f6f4b7b115133", + "placeholder": "​", + "style": "IPY_MODEL_c9bb2353da02443c94afd06685b8cde8", + "value": " 605/605 [00:00<00:00, 24.9kB/s]" + } + }, + "6885319026334ff99533e70c8670baea": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6566341fd0c04880a7ab5ff1409e4448": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4119ae23b5bc4a0c9148e60eb9dfcb53": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "88166d6e4ee04f11a79c2a1a532c7300": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "130c539288884b3aa341f9be6c62d29a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "032d809a9bb64c9b863f6f4b7b115133": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c9bb2353da02443c94afd06685b8cde8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7f808235cbfd4ec28e05215cbd27e3f8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_10e1aa374ebd43a9b083d9b7aab95b23", + "IPY_MODEL_a5844989c8f648df82ee6414eeec9a21", + "IPY_MODEL_d02f4f39378b4e568f0f9382efd6a9d1" + ], + "layout": "IPY_MODEL_76f48ff667754dd682ade56eaaca7049" + } + }, + "10e1aa374ebd43a9b083d9b7aab95b23": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f251ac2cd1ed45d4a7d25d6b906a84bc", + "placeholder": "​", + "style": "IPY_MODEL_824bc6d195ba465497fe34898880a7f8", + "value": "model.safetensors: 100%" + } + }, + "a5844989c8f648df82ee6414eeec9a21": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_05d85f70379b486db128698df001384e", + "max": 1109846632, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_7c620665c5964164adaf3539a0cb5ff5", + "value": 1109846632 + } + }, + "d02f4f39378b4e568f0f9382efd6a9d1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f708d4e670c54cfe93d9ba82818590dc", + "placeholder": "​", + "style": "IPY_MODEL_0be61c031fad4f31877ad29119b6a77e", + "value": " 1.11G/1.11G [00:22<00:00, 52.5MB/s]" + } + }, + "76f48ff667754dd682ade56eaaca7049": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f251ac2cd1ed45d4a7d25d6b906a84bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "824bc6d195ba465497fe34898880a7f8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "05d85f70379b486db128698df001384e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7c620665c5964164adaf3539a0cb5ff5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f708d4e670c54cfe93d9ba82818590dc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0be61c031fad4f31877ad29119b6a77e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "54210209428948189c549c1d2cd939ca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_e77456b2ef2a46ef99572a02e1b70817", + "IPY_MODEL_3fbcd89672f842bf9692ebffd72d26c7", + "IPY_MODEL_f87349ba371f41d788f802a64f9312f9" + ], + "layout": "IPY_MODEL_c217c65b12264609862aeaeb1c25adc0" + } + }, + "e77456b2ef2a46ef99572a02e1b70817": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2bef4b3dc3cd457fa4ccb2e796d771f7", + "placeholder": "​", + "style": "IPY_MODEL_240bdca5cc794ae8b1319310e42aec2c", + "value": "tokenizer_config.json: 100%" + } + }, + "3fbcd89672f842bf9692ebffd72d26c7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_389d3448663f4389afa71734e6cc1434", + "max": 79, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_937847a9252247979ed76e17dc60f7b6", + "value": 79 + } + }, + "f87349ba371f41d788f802a64f9312f9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cb0069caad574477866382fa085508d5", + "placeholder": "​", + "style": "IPY_MODEL_1f3d6a7de82f4710a79d814b6af57679", + "value": " 79.0/79.0 [00:00<00:00, 3.95kB/s]" + } + }, + "c217c65b12264609862aeaeb1c25adc0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2bef4b3dc3cd457fa4ccb2e796d771f7": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "240bdca5cc794ae8b1319310e42aec2c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "389d3448663f4389afa71734e6cc1434": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "937847a9252247979ed76e17dc60f7b6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cb0069caad574477866382fa085508d5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1f3d6a7de82f4710a79d814b6af57679": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5473ab4a957b414aa344f37f921d89a2": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_423cf810d1254518af9e4fc79f97e54e", + "IPY_MODEL_83c2346cea5c4928a23f4fa38d2ffe40", + "IPY_MODEL_e2fde33a78ff4172b806b1530bda4e9a" + ], + "layout": "IPY_MODEL_d1b3fb0032ce400c80922d128940333c" + } + }, + "423cf810d1254518af9e4fc79f97e54e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_96b57a70103f4ac9b014b375e1aee824", + "placeholder": "​", + "style": "IPY_MODEL_74b28f16de69498b9915e501863b4d73", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "83c2346cea5c4928a23f4fa38d2ffe40": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8338b89888584db5944d0ad5baa9118f", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_8baaa8a868ec4e0b98388e565d61a3db", + "value": 5069051 + } + }, + "e2fde33a78ff4172b806b1530bda4e9a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9c1d4b12b247412295fe57637f7d2e60", + "placeholder": "​", + "style": "IPY_MODEL_ac6ad6631b054107b3287180cf3b9e68", + "value": " 5.07M/5.07M [00:00<00:00, 83.1MB/s]" + } + }, + "d1b3fb0032ce400c80922d128940333c": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "96b57a70103f4ac9b014b375e1aee824": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "74b28f16de69498b9915e501863b4d73": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8338b89888584db5944d0ad5baa9118f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8baaa8a868ec4e0b98388e565d61a3db": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9c1d4b12b247412295fe57637f7d2e60": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ac6ad6631b054107b3287180cf3b9e68": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "8c36c9eda216474d86fe73c161690a6d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_14d951a930d5478b9f4e77c430c2bb6f", + "IPY_MODEL_f6ae4d642a9f41cd829dde04347e50a3", + "IPY_MODEL_969548659ca74527afa6a0f92e3a98e7" + ], + "layout": "IPY_MODEL_7f90a349f8ce4525aa33649d70c09a33" + } + }, + "14d951a930d5478b9f4e77c430c2bb6f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_80fb8200d6d74cbca19ce946d73d9efa", + "placeholder": "​", + "style": "IPY_MODEL_67688fa958004ccfbacf0288f4075dea", + "value": "special_tokens_map.json: 100%" + } + }, + "f6ae4d642a9f41cd829dde04347e50a3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa8d4ffdee2d454cb14941089ff881d6", + "max": 150, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2577bab4d4a64010addd7c2b5370a6c8", + "value": 150 + } + }, + "969548659ca74527afa6a0f92e3a98e7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_01f9a013ab38435191872a8bce64dd69", + "placeholder": "​", + "style": "IPY_MODEL_0d91f57bfa25489eb70a23f2bd834cdf", + "value": " 150/150 [00:00<00:00, 10.5kB/s]" + } + }, + "7f90a349f8ce4525aa33649d70c09a33": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "80fb8200d6d74cbca19ce946d73d9efa": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "67688fa958004ccfbacf0288f4075dea": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fa8d4ffdee2d454cb14941089ff881d6": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2577bab4d4a64010addd7c2b5370a6c8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "01f9a013ab38435191872a8bce64dd69": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0d91f57bfa25489eb70a23f2bd834cdf": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb new file mode 100644 index 00000000000000..4a1a54cef9cc6b --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb @@ -0,0 +1,2173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "VjZY8Zs2nOZy" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForSequenceClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_7VIYuX3nOZ1" + }, + "source": [ + "## Import ONNX XlmRoBertaForSequenceClassification models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `XlmRoBertaForSequenceClassification` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import XLM-RoBERTa models trained/fine-tuned for sequence classification via `XlmRoBertaForSequenceClassification` or `TFXlmRoBertaForSequenceClassification`. These models are usually under `Text Classification` category and have `xlm-roberta` in their labels\n", + "- Reference: [TFXlmRoBertaForSequenceClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForSequenceClassification)\n", + "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=text-classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HZGLjeyxnOZ1" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "za0o-flhnOZ3" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VgbvC400nOZ3", + "outputId": "8ee99c2f-2a8b-4db9-d5f1-84092eff5f65", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m23.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m29.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m65.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m26.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m51.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m59.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m58.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m62.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m57.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m62.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m60.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m61.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m47.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m45.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m10.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m17.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m15.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oCS-FetznOZ4" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [cardiffnlp/twitter-xlm-roberta-base-sentiment](https://huggingface.co/cardiffnlp/twitter-xlm-roberta-base-sentiment) model from HuggingFace as an example and load it as a `ORTModelForSequenceClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VREupl3enOZ4", + "outputId": "f5d0f00f-081b-4f29-ad6f-eadd38e94d4d", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 373, + "referenced_widgets": [ + "0f95c2f60eca422a8b484df66cd738e7", + "251aee8a46f84ddbb3dc6a092f041911", + "d01d4a2dcdbb4a88a9ec3df55e949f51", + "39b3eeec515b46109ebd01a3c81cf839", + "81386512a80d4c7986543ae245e2b128", + "1aa9c4c2da4e499c8eb25eff14729039", + "fb4a285fffd9454da534302c6fc17e7d", + "861430374a7843d78a5e2499252f4e78", + "4e476a3c03044b95a5d951a36643dfe5", + "e38b9de9a6da4058956892c87ab4a29a", + "33887ca8a56f44b8ab7b78dcdb604e5e", + "d39f1f408a7241b8bb14dc1fd4ef9df6", + "29a548e9b0484038b62d5f57175e3d58", + "cbe05ca4a37745908898d47bb56c9774", + "eb413258aa00462195161ba8b0047bca", + "b6795898eeb24ed284b2bbd31040697a", + "905c59c8b7bd42a9a305ecb52d93f875", + "1a3c1b1b2d4d45fb96186d0d4b87d746", + "3dcbba3ef4524613848833f7eddc7bf3", + "28315f4670194975a577d772b73fa439", + "f83361dcbeaa4a8b9ddbbe47dc28e3f1", + "9f664ab32d0d43f5bddfe20417c10131", + "e0ed0155dd944f5ebbcc905509f01b20", + "7ec33b66132449a1804b9dd655dba44e", + "1a177346302c400097ede29b9ddcdde3", + "a5ecd770b12843a7af3fad2dae5be8e3", + "c501c8ec56b340d49e6c6698d325398d", + "54d98bec988246afa575f514b5fb538e", + "e33aaf9937d346619cfafb14e2f2257e", + "98c0bef2c2004631a1eea7dbeeb474ef", + "4bcdfb214d224488bd647edf528a6474", + "ffc6bd7026344b7aad522b9a2337bb6f", + "3c088ffa5f1045a59667dfd0b5024db6", + "2a6ea6ad829149abbb37ec45d0721651", + "5c90d0f385f446a1a933231fd82f7c5c", + "06bdcebdd8634c08be2a5c7edba7f20f", + "5e85cd1b82374b5b9c7124a99c13784e", + "6615c07fdecb4049b19f7a2178c3879d", + "db3f53eaa65c44f987b410a60e0c04de", + "a153a09450f8403998482ec3cf5e5424", + "2209890ded0b42b2b11a53aaadc1dfc3", + "ecf37d50caa6458fbaf4dc9300961c83", + "babd560899554c3ab5eb12bbab99938b", + "26aded6abf1242e8910a8051ee80f609" + ] + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/841 [00:00 False\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForSequenceClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForSequenceClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OtCJ9qBvnOZ5" + }, + "source": [ + "Let's have a look inside this and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qIBR7cAqnOZ5", + "outputId": "49d6906d-a710-4d12-e547-3c7638ec1ab4", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 1108048\n", + "-rw-r--r-- 1 root root 915 Jan 9 19:15 config.json\n", + "-rw-r--r-- 1 root root 1112465741 Jan 9 19:15 model.onnx\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:15 sentencepiece.bpe.model\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 471 Jan 9 19:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:15 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-Sp8nJu7nOZ5" + }, + "source": [ + "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2KsG42cmnOZ5" + }, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fKElx2rtnOZ5" + }, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JrxxMgNpnOZ5" + }, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WaxO1clenOZ6" + }, + "source": [ + "Voila! We have our `sentencepiece.bpe.model` and `labels.txt` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C_sD6vcDnOZ6", + "outputId": "ee31714d-f3ff-4e7c-874f-d9f3a2358700", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "onnx_models/cardiffnlp/twitter-xlm-roberta-base-sentiment:\n", + "total 1103100\n", + "drwxr-xr-x 2 root root 4096 Jan 9 19:17 assets\n", + "-rw-r--r-- 1 root root 915 Jan 9 19:15 config.json\n", + "-rw-r--r-- 1 root root 1112465741 Jan 9 19:15 model.onnx\n", + "-rw-r--r-- 1 root root 167 Jan 9 19:15 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 471 Jan 9 19:15 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082730 Jan 9 19:15 tokenizer.json\n", + "\n", + "onnx_models/cardiffnlp/twitter-xlm-roberta-base-sentiment/assets:\n", + "total 4956\n", + "-rw-r--r-- 1 root root 25 Jan 9 19:16 labels.txt\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:15 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WoRaIuTgnOZ6" + }, + "source": [ + "## Import and Save RoBertaForSequenceClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rmyTRnmTnOZ6" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VjgCKRjxnOZ6", + "outputId": "f8d62151-4ae3-4212-d2e6-be61f24cfcc8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-10-16 21:08:22-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-10-16 21:08:23-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: β€˜STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-10-16 21:08:23 (93.8 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m21.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-QbKgNWUnOZ6" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t8nE1WMKnOZ6", + "outputId": "58c3086e-cb83-4472-f0a0-07d87ee70371" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yCoPZcMmnOZ6" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `XlmRoBertaForSequenceClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `XlmRoBertaForSequenceClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hwPVKZyinOZ6" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "sequenceClassifier = XlmRoBertaForSequenceClassification.loadSavedModel(\n", + " ONNX_MODEL,\n", + " spark\n", + " )\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBaSiegrnOZ6" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wJM6A2ZMnOZ6" + }, + "outputs": [], + "source": [ + "sequenceClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BSseNI1ZnOZ6" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-t_ST7fznOZ6" + }, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HrIRyrwJnOZ7" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your XlmRoBertaForSequenceClassification model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "x18SNGz5nOZ7", + "outputId": "b58ae4c0-385a-49f7-a989-9f43c1654648" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 487524\n", + "drwxr-xr-x 5 root root 4096 Oct 16 21:15 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 16 21:15 metadata\n", + "-rw-r--r-- 1 root root 499209257 Oct 16 21:16 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CNG-mf3nnOZ7" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForSequenceClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "E-YVoU8xnOZ7" + }, + "outputs": [], + "source": [ + "sequenceClassifier_loaded = XlmRoBertaForSequenceClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"class\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VpFTaC7GnOZ7" + }, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OkxqlnoBnOZ7", + "outputId": "2dd0576e-8abe-4d8b-8e2c-598783ba116a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['disgust',\n", + " 'optimism',\n", + " 'embarrassment',\n", + " 'amusement',\n", + " 'realization',\n", + " 'surprise',\n", + " 'grief',\n", + " 'caring',\n", + " 'disapproval',\n", + " 'disappointment',\n", + " 'joy',\n", + " 'confusion',\n", + " 'excitement',\n", + " 'approval',\n", + " 'curiosity',\n", + " 'anger',\n", + " 'love',\n", + " 'admiration',\n", + " 'gratitude',\n", + " 'annoyance',\n", + " 'remorse',\n", + " 'nervousness',\n", + " 'neutral',\n", + " 'pride',\n", + " 'fear',\n", + " 'sadness',\n", + " 'desire',\n", + " 'relief']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "sequenceClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c62SdOTdnOZ7" + }, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP πŸš€ pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Di5xRn1nOZ7" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " sequenceClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"I love you!\"], ['I feel lucky to be here.']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"class.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_ka-wmU-nOZ7" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `XlmRoBertaForSequenceClassification` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "0f95c2f60eca422a8b484df66cd738e7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_251aee8a46f84ddbb3dc6a092f041911", + "IPY_MODEL_d01d4a2dcdbb4a88a9ec3df55e949f51", + "IPY_MODEL_39b3eeec515b46109ebd01a3c81cf839" + ], + "layout": "IPY_MODEL_81386512a80d4c7986543ae245e2b128" + } + }, + "251aee8a46f84ddbb3dc6a092f041911": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1aa9c4c2da4e499c8eb25eff14729039", + "placeholder": "​", + "style": "IPY_MODEL_fb4a285fffd9454da534302c6fc17e7d", + "value": "config.json: 100%" + } + }, + "d01d4a2dcdbb4a88a9ec3df55e949f51": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_861430374a7843d78a5e2499252f4e78", + "max": 841, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4e476a3c03044b95a5d951a36643dfe5", + "value": 841 + } + }, + "39b3eeec515b46109ebd01a3c81cf839": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e38b9de9a6da4058956892c87ab4a29a", + "placeholder": "​", + "style": "IPY_MODEL_33887ca8a56f44b8ab7b78dcdb604e5e", + "value": " 841/841 [00:00<00:00, 52.8kB/s]" + } + }, + "81386512a80d4c7986543ae245e2b128": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1aa9c4c2da4e499c8eb25eff14729039": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fb4a285fffd9454da534302c6fc17e7d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "861430374a7843d78a5e2499252f4e78": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4e476a3c03044b95a5d951a36643dfe5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e38b9de9a6da4058956892c87ab4a29a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "33887ca8a56f44b8ab7b78dcdb604e5e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d39f1f408a7241b8bb14dc1fd4ef9df6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_29a548e9b0484038b62d5f57175e3d58", + "IPY_MODEL_cbe05ca4a37745908898d47bb56c9774", + "IPY_MODEL_eb413258aa00462195161ba8b0047bca" + ], + "layout": "IPY_MODEL_b6795898eeb24ed284b2bbd31040697a" + } + }, + "29a548e9b0484038b62d5f57175e3d58": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_905c59c8b7bd42a9a305ecb52d93f875", + "placeholder": "​", + "style": "IPY_MODEL_1a3c1b1b2d4d45fb96186d0d4b87d746", + "value": "pytorch_model.bin: 100%" + } + }, + "cbe05ca4a37745908898d47bb56c9774": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3dcbba3ef4524613848833f7eddc7bf3", + "max": 1112271561, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_28315f4670194975a577d772b73fa439", + "value": 1112271561 + } + }, + "eb413258aa00462195161ba8b0047bca": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f83361dcbeaa4a8b9ddbbe47dc28e3f1", + "placeholder": "​", + "style": "IPY_MODEL_9f664ab32d0d43f5bddfe20417c10131", + "value": " 1.11G/1.11G [00:07<00:00, 180MB/s]" + } + }, + "b6795898eeb24ed284b2bbd31040697a": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "905c59c8b7bd42a9a305ecb52d93f875": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1a3c1b1b2d4d45fb96186d0d4b87d746": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3dcbba3ef4524613848833f7eddc7bf3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "28315f4670194975a577d772b73fa439": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f83361dcbeaa4a8b9ddbbe47dc28e3f1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9f664ab32d0d43f5bddfe20417c10131": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e0ed0155dd944f5ebbcc905509f01b20": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7ec33b66132449a1804b9dd655dba44e", + "IPY_MODEL_1a177346302c400097ede29b9ddcdde3", + "IPY_MODEL_a5ecd770b12843a7af3fad2dae5be8e3" + ], + "layout": "IPY_MODEL_c501c8ec56b340d49e6c6698d325398d" + } + }, + "7ec33b66132449a1804b9dd655dba44e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_54d98bec988246afa575f514b5fb538e", + "placeholder": "​", + "style": "IPY_MODEL_e33aaf9937d346619cfafb14e2f2257e", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "1a177346302c400097ede29b9ddcdde3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_98c0bef2c2004631a1eea7dbeeb474ef", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4bcdfb214d224488bd647edf528a6474", + "value": 5069051 + } + }, + "a5ecd770b12843a7af3fad2dae5be8e3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ffc6bd7026344b7aad522b9a2337bb6f", + "placeholder": "​", + "style": "IPY_MODEL_3c088ffa5f1045a59667dfd0b5024db6", + "value": " 5.07M/5.07M [00:00<00:00, 124MB/s]" + } + }, + "c501c8ec56b340d49e6c6698d325398d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "54d98bec988246afa575f514b5fb538e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e33aaf9937d346619cfafb14e2f2257e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "98c0bef2c2004631a1eea7dbeeb474ef": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4bcdfb214d224488bd647edf528a6474": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ffc6bd7026344b7aad522b9a2337bb6f": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c088ffa5f1045a59667dfd0b5024db6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2a6ea6ad829149abbb37ec45d0721651": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5c90d0f385f446a1a933231fd82f7c5c", + "IPY_MODEL_06bdcebdd8634c08be2a5c7edba7f20f", + "IPY_MODEL_5e85cd1b82374b5b9c7124a99c13784e" + ], + "layout": "IPY_MODEL_6615c07fdecb4049b19f7a2178c3879d" + } + }, + "5c90d0f385f446a1a933231fd82f7c5c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_db3f53eaa65c44f987b410a60e0c04de", + "placeholder": "​", + "style": "IPY_MODEL_a153a09450f8403998482ec3cf5e5424", + "value": "special_tokens_map.json: 100%" + } + }, + "06bdcebdd8634c08be2a5c7edba7f20f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2209890ded0b42b2b11a53aaadc1dfc3", + "max": 150, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ecf37d50caa6458fbaf4dc9300961c83", + "value": 150 + } + }, + "5e85cd1b82374b5b9c7124a99c13784e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_babd560899554c3ab5eb12bbab99938b", + "placeholder": "​", + "style": "IPY_MODEL_26aded6abf1242e8910a8051ee80f609", + "value": " 150/150 [00:00<00:00, 7.91kB/s]" + } + }, + "6615c07fdecb4049b19f7a2178c3879d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "db3f53eaa65c44f987b410a60e0c04de": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a153a09450f8403998482ec3cf5e5424": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2209890ded0b42b2b11a53aaadc1dfc3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ecf37d50caa6458fbaf4dc9300961c83": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "babd560899554c3ab5eb12bbab99938b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "26aded6abf1242e8910a8051ee80f609": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb new file mode 100644 index 00000000000000..0cc16cf9245d9b --- /dev/null +++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb @@ -0,0 +1,2144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "PT2s_38mqpqS" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_XlmRoBertaForTokenClassification.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iss2RqRIqpqV" + }, + "source": [ + "## Import ONNX XlmRoBertaForTokenClassification models from HuggingFace πŸ€— into Spark NLP πŸš€\n", + "\n", + "Let's keep in mind a few things before we start 😊\n", + "\n", + "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high performance inference for models.\n", + "- `XlmRoBertaForTokenClassification` is only available since in `Spark NLP 5.2.3` and after. So please make sure you have upgraded to the latest Spark NLP release\n", + "- You can import XLM-RoBERTa models trained/fine-tuned for token classification via `XlmRoBertaForTokenClassification` or `TFXlmRoBertaForTokenClassification`. These models are usually under `Token Classification` category and have `roberta` in their labels\n", + "- Reference: [TFXlmRoBertaForTokenClassification](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.TFXLMRobertaForTokenClassification)\n", + "- Some [example models](https://huggingface.co/models?filter=xlm-roberta&pipeline_tag=token-classification)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yhZZmLjgqpqX" + }, + "source": [ + "## Export and Save HuggingFace model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rUErF-PMqpqX" + }, + "source": [ + "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n", + "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases\n", + "- Albert uses SentencePiece, so we will have to install that as well" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "IairJqDCqpqY", + "outputId": "b93fb73a-4bb0-442b-f6da-7228362ef1ee", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m403.3/403.3 kB\u001b[0m \u001b[31m19.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m455.7/455.7 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m65.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m38.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m15.7/15.7 MB\u001b[0m \u001b[31m65.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m70.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m97.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m111.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.4/6.4 MB\u001b[0m \u001b[31m102.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m100.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m102.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m109.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m96.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m100.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m81.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m44.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m100.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m475.2/475.2 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.9/489.9 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m67.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m35.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m64.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m67.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m489.8/489.8 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.7/479.7 MB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m84.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m104.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.8/440.8 kB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m524.1/524.1 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m81.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m440.7/440.7 kB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m76.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m781.3/781.3 kB\u001b[0m \u001b[31m62.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m98.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m585.9/585.9 MB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m588.3/588.3 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.0/6.0 MB\u001b[0m \u001b[31m105.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m439.2/439.2 kB\u001b[0m \u001b[31m43.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m98.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.8/83.8 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m42.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m43.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m451.2/451.2 kB\u001b[0m \u001b[31m42.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m78.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m79.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m93.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.1/13.1 MB\u001b[0m \u001b[31m28.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.\n", + "tensorflow-datasets 4.9.4 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.\n", + "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.6 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum tensorflow" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jjV1pA-nqpqY" + }, + "source": [ + "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n", + "- We'll use [xlm-roberta-large-finetuned-conll03-english](https://huggingface.co/xlm-roberta-large-finetuned-conll03-english) model from HuggingFace as an example and load it as a `ORTModelForTokenClassification`, representing an ONNX model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "WATHU1YWqpqY", + "outputId": "9c48ca4b-a967-418b-b350-a7a5827be281", + "colab": { + "referenced_widgets": [ + "1544c9cc98bc469b98ae804569204420", + "94da2b3a4e7743a2b7dd362292c5496f", + "6494c0f839ff4418a93df6e88c012d07", + "df8afa14db524e2992b832b30bc0f692", + "78b0860d0ed643d785ef00633d9e17e8", + "451d9aec16b8417fb3b9565e5a73cb52", + "e7df5ab266744d59b29e8c11106dcb65", + "547e218edc3d47cab11d876641955409", + "86abb923170a4927a99c0289540ecdf4", + "dff16b39b66a4422a407096064fa182e", + "119f12a1e8204bf7b7bbf1b4d7cca247", + "e53d77092ea646b5bff9e5c4051f0709", + "7538f72687754318ac6657cd98f1f5ae", + "b80a55635ff342eaa98451556f4908d3", + "b3ee928046b94c9194b5b5e30c61becb", + "a2f65e14834a47e69687d32ee896d31d", + "e0e53ea997404498850c7dff6f80a5fb", + "68e4ba7bf6c5483abcae494fcdd46c6a", + "349c635d1e8c47d6ab1d0fe3819dd837", + "eded9c7da1eb4f6c9e713c8b23b4327a", + "59a857c998734b0a95af2b96252aa130", + "ace71ecb2fae4e4196f5c75b86a522f8", + "ee4965a2b5ad435c8c82b377006aa73e", + "9c3ec5377e884b3faebd24fb815b6a85", + "9ca46de0beaa4c5d9ec526820d7aa94f", + "238132625e604ddf85bdbf4931889d51", + "61335821e4c94fcba501cb1c94541d07", + "883c34b77a4c4558b74d5dde797e22ab", + "a85eea6afaf0480eac17817e4844539a", + "7277b2423de14e3aada27e5191f096e5", + "0faedd1c4d4148fa965bcec52325bd08", + "c2ab7313174c4231825be28c4a3181b8", + "45123f4cdc0a4aa8ac90aa29d357240e", + "0bd10f7cb5244da29d0a7da73ae52335", + "825857db473849d2bb498ffb5fcfb962", + "a88c300d3a81439fb3da9d46a023dc47", + "636fccb3b002475a90c888f987b36400", + "89c7d83dc8e640cbb93ccfe2bb3030f0", + "87671904fcdc4af993d4ba61f3a5f9e9", + "2510ee2600af466a851566f4634e7fe9", + "44e1f77b21e04d32ad83a405ab62ca38", + "106c462bc57243018162577b103db007", + "1826afdb3fd94748941c71d8621682e3", + "21ca8729098a4bd498b29de51a92e8bd" + ], + "base_uri": "https://localhost:8080/", + "height": 391 + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "config.json: 0%| | 0.00/852 [00:00 False\n", + "Saving external data to one file...\n" + ] + } + ], + "source": [ + "from optimum.onnxruntime import ORTModelForTokenClassification\n", + "import tensorflow as tf\n", + "\n", + "MODEL_NAME = 'xlm-roberta-large-finetuned-conll03-english'\n", + "ONNX_MODEL = f\"onnx_models/{MODEL_NAME}\"\n", + "\n", + "ort_model = ORTModelForTokenClassification.from_pretrained(MODEL_NAME, export=True)\n", + "\n", + "# Save the ONNX model\n", + "ort_model.save_pretrained(ONNX_MODEL)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lH-d7yGXqpqZ" + }, + "source": [ + "Let's have a look inside the directory and see what we are dealing with:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "fp6t5TETqpqZ", + "outputId": "3eab09a6-51c3-48f8-d8e5-74e76fe22585", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "total 2205260\n", + "-rw-r--r-- 1 root root 1046 Jan 9 19:26 config.json\n", + "-rw-r--r-- 1 root root 617783 Jan 9 19:26 model.onnx\n", + "-rw-r--r-- 1 root root 2235396096 Jan 9 19:26 model.onnx_data\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:26 sentencepiece.bpe.model\n", + "-rw-r--r-- 1 root root 280 Jan 9 19:26 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 418 Jan 9 19:26 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082660 Jan 9 19:26 tokenizer.json\n" + ] + } + ], + "source": [ + "!ls -l {ONNX_MODEL}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "u5kdjGpdqpqZ" + }, + "outputs": [], + "source": [ + "!mkdir {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q53KnN90qpqZ" + }, + "source": [ + "- As you can see, we need to move `sentencepiece.bpe.model` from the tokenizer to `assets` folder which Spark NLP will look for\n", + "- We also need `labels` and their `ids` which is saved inside the model's config. We will save this inside `labels.txt`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "NYYI6xnTqpqa" + }, + "outputs": [], + "source": [ + "# get label2id dictionary\n", + "labels = ort_model.config.id2label\n", + "# sort the dictionary based on the id\n", + "labels = [value for key,value in sorted(labels.items(), reverse=False)]\n", + "\n", + "with open(ONNX_MODEL + '/assets/labels.txt', 'w') as f:\n", + " f.write('\\n'.join(labels))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "hx6uf2PPqpqa" + }, + "outputs": [], + "source": [ + "!mv {ONNX_MODEL}/sentencepiece.bpe.model {ONNX_MODEL}/assets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "idrz2RCWqpqa" + }, + "source": [ + "Voila! We have our `sentencepiece.bpe.model` inside assets directory" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "T5YSOXhLqpqa", + "outputId": "028c70df-7f80-4bf6-9779-e8b26ee574aa", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "onnx_models/xlm-roberta-large-finetuned-conll03-english:\n", + "total 2200312\n", + "drwxr-xr-x 2 root root 4096 Jan 9 19:26 assets\n", + "-rw-r--r-- 1 root root 1046 Jan 9 19:26 config.json\n", + "-rw-r--r-- 1 root root 617783 Jan 9 19:26 model.onnx\n", + "-rw-r--r-- 1 root root 2235396096 Jan 9 19:26 model.onnx_data\n", + "-rw-r--r-- 1 root root 280 Jan 9 19:26 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 418 Jan 9 19:26 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 17082660 Jan 9 19:26 tokenizer.json\n", + "\n", + "onnx_models/xlm-roberta-large-finetuned-conll03-english/assets:\n", + "total 4956\n", + "-rw-r--r-- 1 root root 45 Jan 9 19:26 labels.txt\n", + "-rw-r--r-- 1 root root 5069051 Jan 9 19:26 sentencepiece.bpe.model\n" + ] + } + ], + "source": [ + "!ls -lR {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yC4lfOb_qpqb" + }, + "source": [ + "## Import and Save RoBertaForTokenClassification in Spark NLP\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_T57R-wBqpqb" + }, + "source": [ + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vqzkT2Tbqpqb", + "outputId": "3d1b295e-e6e9-409c-f73f-e778352aa7ff" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.1.3\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.3\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m537.5/537.5 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e_C8Rt6Iqpqb" + }, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "44kpKSG-qpqb", + "outputId": "d556353a-cd63-4e2a-a5e6-fcfbfa72fa57" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "# let's start Spark with Spark NLP\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Apache Spark version: {}\".format(spark.version))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IHPVcE9nqpqc" + }, + "source": [ + "- Let's use `loadSavedModel` functon in `RoBertaForTokenClassification` which allows us to load TensorFlow model in SavedModel format\n", + "- Most params can be set later when you are loading this model in `RoBertaForTokenClassification` in runtime like `setMaxSentenceLength`, so don't worry what you are setting them now\n", + "- `loadSavedModel` accepts two params, first is the path to the TF SavedModel. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-_OWtRBHqpqc" + }, + "outputs": [], + "source": [ + "from sparknlp.annotator import *\n", + "from sparknlp.base import *\n", + "\n", + "tokenClassifier = RoBertaForTokenClassification\\\n", + " .loadSavedModel(ONNX_MODEL, spark)\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")\\\n", + " .setCaseSensitive(True)\\\n", + " .setMaxSentenceLength(128)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cgoFul55qpqc" + }, + "source": [ + "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EzK8inoxqpqc" + }, + "outputs": [], + "source": [ + "tokenClassifier.write().overwrite().save(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0VwXvPlbqpqc" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1Ce2ZAEtqpqc" + }, + "outputs": [], + "source": [ + "!rm -rf {ONNX_MODEL}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Z4QXBzVsqpqd" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your RoBertaForTokenClassification model from HuggingFace πŸ€— loaded and saved by Spark NLP πŸš€" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wRKVUu9tqpqd", + "outputId": "2278d83a-63be-4e1a-b574-29c963b4b7a1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 318696\n", + "drwxr-xr-x 5 root root 4096 Oct 16 22:21 fields\n", + "drwxr-xr-x 2 root root 4096 Oct 16 22:21 metadata\n", + "-rw-r--r-- 1 root root 326328924 Oct 16 22:21 roberta_classification_onnx\n" + ] + } + ], + "source": [ + "! ls -l {ONNX_MODEL}_spark_nlp_onnx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qXl-kXeLqpqd" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny RoBertaForTokenClassification model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QTrtB8u7qpqd" + }, + "outputs": [], + "source": [ + "tokenClassifier_loaded = RoBertaForTokenClassification.load(\"./{}_spark_nlp_onnx\".format(ONNX_MODEL))\\\n", + " .setInputCols([\"document\",'token'])\\\n", + " .setOutputCol(\"ner\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UDHQves4qpqd" + }, + "source": [ + "You can see what labels were used to train this model via `getClasses` function:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "01Aw5e47qpqe", + "outputId": "69cfded4-763c-41a3-f7ea-4ef56a744741" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['B-LOC', 'I-ORG', 'I-LOC', 'I-PER', 'B-ORG', 'O', 'B-PER']" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# .getClasses was introduced in spark-nlp==3.4.0\n", + "tokenClassifier_loaded.getClasses()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F7Kbxqvxqpqe" + }, + "source": [ + "This is how you can use your loaded classifier model in Spark NLP πŸš€ pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2f0E2Gvxqpqe", + "outputId": "fc05a614-cc89-417b-fad8-1290b34905e0" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+\n", + "| text| result|\n", + "+--------------------+--------------------+\n", + "|My name is Clara ...|[O, O, O, B-PER, ...|\n", + "|My name is Clara ...|[O, O, O, B-PER, ...|\n", + "+--------------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol('text') \\\n", + " .setOutputCol('document')\n", + "\n", + "tokenizer = Tokenizer() \\\n", + " .setInputCols(['document']) \\\n", + " .setOutputCol('token')\n", + "\n", + "pipeline = Pipeline(stages=[\n", + " document_assembler,\n", + " tokenizer,\n", + " tokenClassifier_loaded\n", + "])\n", + "\n", + "# couple of simple examples\n", + "example = spark.createDataFrame([[\"My name is Clara and I live in Berkeley, California.\"], ['My name is Clara and I live in Berkeley, California.']]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(example).transform(example)\n", + "\n", + "# result is a DataFrame\n", + "result.select(\"text\", \"ner.result\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WOGLNugSqpqe" + }, + "source": [ + "That's it! You can now go wild and use hundreds of `RoBertaForTokenClassification` models from HuggingFace πŸ€— in Spark NLP πŸš€\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1544c9cc98bc469b98ae804569204420": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_94da2b3a4e7743a2b7dd362292c5496f", + "IPY_MODEL_6494c0f839ff4418a93df6e88c012d07", + "IPY_MODEL_df8afa14db524e2992b832b30bc0f692" + ], + "layout": "IPY_MODEL_78b0860d0ed643d785ef00633d9e17e8" + } + }, + "94da2b3a4e7743a2b7dd362292c5496f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_451d9aec16b8417fb3b9565e5a73cb52", + "placeholder": "​", + "style": "IPY_MODEL_e7df5ab266744d59b29e8c11106dcb65", + "value": "config.json: 100%" + } + }, + "6494c0f839ff4418a93df6e88c012d07": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_547e218edc3d47cab11d876641955409", + "max": 852, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_86abb923170a4927a99c0289540ecdf4", + "value": 852 + } + }, + "df8afa14db524e2992b832b30bc0f692": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dff16b39b66a4422a407096064fa182e", + "placeholder": "​", + "style": "IPY_MODEL_119f12a1e8204bf7b7bbf1b4d7cca247", + "value": " 852/852 [00:00<00:00, 13.4kB/s]" + } + }, + "78b0860d0ed643d785ef00633d9e17e8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "451d9aec16b8417fb3b9565e5a73cb52": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e7df5ab266744d59b29e8c11106dcb65": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "547e218edc3d47cab11d876641955409": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "86abb923170a4927a99c0289540ecdf4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "dff16b39b66a4422a407096064fa182e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "119f12a1e8204bf7b7bbf1b4d7cca247": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e53d77092ea646b5bff9e5c4051f0709": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7538f72687754318ac6657cd98f1f5ae", + "IPY_MODEL_b80a55635ff342eaa98451556f4908d3", + "IPY_MODEL_b3ee928046b94c9194b5b5e30c61becb" + ], + "layout": "IPY_MODEL_a2f65e14834a47e69687d32ee896d31d" + } + }, + "7538f72687754318ac6657cd98f1f5ae": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e0e53ea997404498850c7dff6f80a5fb", + "placeholder": "​", + "style": "IPY_MODEL_68e4ba7bf6c5483abcae494fcdd46c6a", + "value": "model.safetensors: 100%" + } + }, + "b80a55635ff342eaa98451556f4908d3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_349c635d1e8c47d6ab1d0fe3819dd837", + "max": 2239643256, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_eded9c7da1eb4f6c9e713c8b23b4327a", + "value": 2239643256 + } + }, + "b3ee928046b94c9194b5b5e30c61becb": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_59a857c998734b0a95af2b96252aa130", + "placeholder": "​", + "style": "IPY_MODEL_ace71ecb2fae4e4196f5c75b86a522f8", + "value": " 2.24G/2.24G [00:17<00:00, 173MB/s]" + } + }, + "a2f65e14834a47e69687d32ee896d31d": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e0e53ea997404498850c7dff6f80a5fb": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "68e4ba7bf6c5483abcae494fcdd46c6a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "349c635d1e8c47d6ab1d0fe3819dd837": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "eded9c7da1eb4f6c9e713c8b23b4327a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "59a857c998734b0a95af2b96252aa130": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ace71ecb2fae4e4196f5c75b86a522f8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ee4965a2b5ad435c8c82b377006aa73e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9c3ec5377e884b3faebd24fb815b6a85", + "IPY_MODEL_9ca46de0beaa4c5d9ec526820d7aa94f", + "IPY_MODEL_238132625e604ddf85bdbf4931889d51" + ], + "layout": "IPY_MODEL_61335821e4c94fcba501cb1c94541d07" + } + }, + "9c3ec5377e884b3faebd24fb815b6a85": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_883c34b77a4c4558b74d5dde797e22ab", + "placeholder": "​", + "style": "IPY_MODEL_a85eea6afaf0480eac17817e4844539a", + "value": "sentencepiece.bpe.model: 100%" + } + }, + "9ca46de0beaa4c5d9ec526820d7aa94f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7277b2423de14e3aada27e5191f096e5", + "max": 5069051, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0faedd1c4d4148fa965bcec52325bd08", + "value": 5069051 + } + }, + "238132625e604ddf85bdbf4931889d51": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c2ab7313174c4231825be28c4a3181b8", + "placeholder": "​", + "style": "IPY_MODEL_45123f4cdc0a4aa8ac90aa29d357240e", + "value": " 5.07M/5.07M [00:00<00:00, 41.3MB/s]" + } + }, + "61335821e4c94fcba501cb1c94541d07": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "883c34b77a4c4558b74d5dde797e22ab": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a85eea6afaf0480eac17817e4844539a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7277b2423de14e3aada27e5191f096e5": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0faedd1c4d4148fa965bcec52325bd08": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c2ab7313174c4231825be28c4a3181b8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "45123f4cdc0a4aa8ac90aa29d357240e": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0bd10f7cb5244da29d0a7da73ae52335": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_825857db473849d2bb498ffb5fcfb962", + "IPY_MODEL_a88c300d3a81439fb3da9d46a023dc47", + "IPY_MODEL_636fccb3b002475a90c888f987b36400" + ], + "layout": "IPY_MODEL_89c7d83dc8e640cbb93ccfe2bb3030f0" + } + }, + "825857db473849d2bb498ffb5fcfb962": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_87671904fcdc4af993d4ba61f3a5f9e9", + "placeholder": "​", + "style": "IPY_MODEL_2510ee2600af466a851566f4634e7fe9", + "value": "tokenizer.json: 100%" + } + }, + "a88c300d3a81439fb3da9d46a023dc47": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_44e1f77b21e04d32ad83a405ab62ca38", + "max": 9096718, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_106c462bc57243018162577b103db007", + "value": 9096718 + } + }, + "636fccb3b002475a90c888f987b36400": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1826afdb3fd94748941c71d8621682e3", + "placeholder": "​", + "style": "IPY_MODEL_21ca8729098a4bd498b29de51a92e8bd", + "value": " 9.10M/9.10M [00:00<00:00, 25.6MB/s]" + } + }, + "89c7d83dc8e640cbb93ccfe2bb3030f0": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "87671904fcdc4af993d4ba61f3a5f9e9": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2510ee2600af466a851566f4634e7fe9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "44e1f77b21e04d32ad83a405ab62ca38": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "106c462bc57243018162577b103db007": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "1826afdb3fd94748941c71d8621682e3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "21ca8729098a4bd498b29de51a92e8bd": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala index 11e29c8c004474..2231bdb91592b1 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala @@ -108,7 +108,7 @@ private[johnsnowlabs] class AlbertClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch, maxSentenceLength, sequence = true) + case ONNX.name => getRawScoresWithOnnx(batch, maxSentenceLength, sequence = true) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -128,7 +128,7 @@ private[johnsnowlabs] class AlbertClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch, maxSentenceLength, sequence = true) + case ONNX.name => getRawScoresWithOnnx(batch, maxSentenceLength, sequence = true) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -203,7 +203,7 @@ private[johnsnowlabs] class AlbertClassification( rawScores } - private def getRowScoresWithOnnx( + private def getRawScoresWithOnnx( batch: Seq[Array[Int]], maxSentenceLength: Int, sequence: Boolean): Array[Float] = { diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala index 7b4dfaf233f879..1a38fe2b2864e9 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/BertClassification.scala @@ -149,7 +149,7 @@ private[johnsnowlabs] class BertClassification( val rawScores = detectedEngine match { case ONNX.name => - getRowScoresWithOnnx(batch, maxSentenceLength) + getRawScoresWithOnnx(batch, maxSentenceLength) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -218,7 +218,7 @@ private[johnsnowlabs] class BertClassification( rawScores } - private def getRowScoresWithOnnx( + private def getRawScoresWithOnnx( batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { @@ -265,7 +265,7 @@ private[johnsnowlabs] class BertClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { case ONNX.name => - getRowScoresWithOnnx(batch, maxSentenceLength) + getRawScoresWithOnnx(batch, maxSentenceLength) case _ => getRawScoresWithTF(batch, maxSentenceLength) } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala index 0fee4f4043ac42..e7675367debbca 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/CamemBertClassification.scala @@ -123,7 +123,7 @@ private[johnsnowlabs] class CamemBertClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -189,7 +189,7 @@ private[johnsnowlabs] class CamemBertClassification( rawScores } - private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { // [nb of encoded sentences , maxSentenceLength] val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) @@ -227,7 +227,7 @@ private[johnsnowlabs] class CamemBertClassification( val batchLength = batch.length val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch, maxSentenceLength) } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala index 881ba99607c481..965d70f2da767b 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/DeBertaClassification.scala @@ -109,7 +109,7 @@ private[johnsnowlabs] class DeBertaClassification( val batchLength = batch.length val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch) } @@ -182,7 +182,7 @@ private[johnsnowlabs] class DeBertaClassification( rawScores } - private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { // [nb of encoded sentences , maxSentenceLength] val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) @@ -219,7 +219,7 @@ private[johnsnowlabs] class DeBertaClassification( val batchLength = batch.length val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch) } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/DistilBertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/DistilBertClassification.scala index 099622429ecf80..00c62faabbcc4f 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/DistilBertClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/DistilBertClassification.scala @@ -148,7 +148,7 @@ private[johnsnowlabs] class DistilBertClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -211,7 +211,7 @@ private[johnsnowlabs] class DistilBertClassification( rawScores } - private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) @@ -247,7 +247,7 @@ private[johnsnowlabs] class DistilBertClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch, maxSentenceLength) } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala index 054d1eff76f2d5..85ec88e95caf0f 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/RoBertaClassification.scala @@ -141,7 +141,7 @@ private[johnsnowlabs] class RoBertaClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch, maxSentenceLength) } @@ -207,7 +207,7 @@ private[johnsnowlabs] class RoBertaClassification( rawScores } - private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + private def getRawScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { // [nb of encoded sentences , maxSentenceLength] val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) @@ -244,7 +244,7 @@ private[johnsnowlabs] class RoBertaClassification( val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val rawScores = detectedEngine match { - case ONNX.name => getRowScoresWithOnnx(batch) + case ONNX.name => getRawScoresWithOnnx(batch) case _ => getRawScoresWithTF(batch, maxSentenceLength) } diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala index bddf0da0bbd368..afb530647c1bc2 100644 --- a/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala +++ b/src/main/scala/com/johnsnowlabs/ml/ai/XlmRoBertaClassification.scala @@ -16,9 +16,12 @@ package com.johnsnowlabs.ml.ai +import ai.onnxruntime.OnnxTensor +import com.johnsnowlabs.ml.onnx.{OnnxSession, OnnxWrapper} import com.johnsnowlabs.ml.tensorflow.sentencepiece.{SentencePieceWrapper, SentencepieceEncoder} import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignatureManager} import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.{BasicTokenizer, WordpieceEncoder} import com.johnsnowlabs.nlp.{ActivationFunction, Annotation} @@ -27,28 +30,34 @@ import org.tensorflow.ndarray.buffer.IntDataBuffer import scala.collection.JavaConverters._ /** @param tensorflowWrapper - * XLM-RoBERTa Model wrapper with TensorFlow Wrapper - * @param spp - * XlmRoberta SentencePiece model with SentencePieceWrapper - * @param configProtoBytes - * Configuration for TensorFlow session - * @param tags - * labels which model was trained with in order - * @param signatures - * TF v2 signatures in Spark NLP - */ + * XLM-RoBERTa Model wrapper with TensorFlow Wrapper + * @param spp + * XlmRoberta SentencePiece model with SentencePieceWrapper + * @param configProtoBytes + * Configuration for TensorFlow session + * @param tags + * labels which model was trained with in order + * @param signatures + * TF v2 signatures in Spark NLP + */ private[johnsnowlabs] class XlmRoBertaClassification( - val tensorflowWrapper: TensorflowWrapper, - val spp: SentencePieceWrapper, - configProtoBytes: Option[Array[Byte]] = None, - tags: Map[String, Int], - signatures: Option[Map[String, String]] = None, - threshold: Float = 0.5f) - extends Serializable + val tensorflowWrapper: Option[TensorflowWrapper], + val onnxWrapper: Option[OnnxWrapper], + val spp: SentencePieceWrapper, + configProtoBytes: Option[Array[Byte]] = None, + tags: Map[String, Int], + signatures: Option[Map[String, String]] = None, + threshold: Float = 0.5f) + extends Serializable with XXXForClassification { val _tfXlmRoBertaSignatures: Map[String, String] = signatures.getOrElse(ModelSignatureManager.apply()) + val detectedEngine: String = + if (tensorflowWrapper.isDefined) TensorFlow.name + else if (onnxWrapper.isDefined) ONNX.name + else TensorFlow.name + private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions protected val sentenceStartTokenId: Int = 0 protected val sentenceEndTokenId: Int = 2 @@ -58,9 +67,9 @@ private[johnsnowlabs] class XlmRoBertaClassification( protected val sigmoidThreshold: Float = threshold def tokenizeWithAlignment( - sentences: Seq[TokenizedSentence], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + sentences: Seq[TokenizedSentence], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val encoder = new SentencepieceEncoder(spp, caseSensitive, sentencePieceDelimiterId, pieceIdOffset = 1) @@ -75,9 +84,9 @@ private[johnsnowlabs] class XlmRoBertaClassification( } def tokenizeSeqString( - candidateLabels: Seq[String], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + candidateLabels: Seq[String], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val basicTokenizer = new BasicTokenizer(caseSensitive) val encoder = @@ -92,9 +101,9 @@ private[johnsnowlabs] class XlmRoBertaClassification( }) } def tokenizeDocument( - docs: Seq[Annotation], - maxSeqLength: Int, - caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { + docs: Seq[Annotation], + maxSeqLength: Int, + caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = { val encoder = new SentencepieceEncoder( @@ -113,52 +122,15 @@ private[johnsnowlabs] class XlmRoBertaClassification( } def tag(batch: Seq[Array[Int]]): Seq[Array[Array[Float]]] = { - val tensors = new TensorResources() val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length - val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) - - // [nb of encoded sentences , maxSentenceLength] - val shape = Array(batch.length.toLong, maxSentenceLength) - - batch.zipWithIndex - .foreach { case (sentence, idx) => - val offset = idx * maxSentenceLength - tokenBuffers.offset(offset).write(sentence) - maskBuffers - .offset(offset) - .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) - } - - val runner = tensorflowWrapper - .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) - .runner - - val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) - val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - - runner - .feed( - _tfXlmRoBertaSignatures - .getOrElse(ModelSignatureConstants.InputIds.key, "missing_input_id_key"), - tokenTensors) - .feed( - _tfXlmRoBertaSignatures - .getOrElse(ModelSignatureConstants.AttentionMask.key, "missing_input_mask_key"), - maskTensors) - .fetch(_tfXlmRoBertaSignatures - .getOrElse(ModelSignatureConstants.LogitsOutput.key, "missing_logits_key")) - - val outs = runner.run().asScala - val rawScores = TensorResources.extractFloats(outs.head) - - outs.foreach(_.close()) - tensors.clearSession(outs) - tensors.clearTensors() - + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch) + case _ => getRawScoresWithTF(batch, maxSentenceLength) + } + println(rawScores.mkString("Array(", ", ", ")")) val dim = rawScores.length / (batchLength * maxSentenceLength) val batchScores: Array[Array[Array[Float]]] = rawScores .grouped(dim) @@ -170,10 +142,9 @@ private[johnsnowlabs] class XlmRoBertaClassification( batchScores } - def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { + private def getRawScoresWithTF(batch: Seq[Array[Int]], maxSentenceLength: Int): Array[Float] = { val tensors = new TensorResources() - val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max val batchLength = batch.length val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) @@ -191,9 +162,11 @@ private[johnsnowlabs] class XlmRoBertaClassification( .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) } - val runner = tensorflowWrapper - .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) - .runner + val session = tensorflowWrapper.get.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) @@ -217,6 +190,51 @@ private[johnsnowlabs] class XlmRoBertaClassification( tensors.clearSession(outs) tensors.clearTensors() + rawScores + } + + private def getRowScoresWithOnnx(batch: Seq[Array[Int]]): Array[Float] = { + + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val results = runner.run(inputs) + try { + val embeddings = results + .get("logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + tokenTensors.close() + maskTensors.close() + + embeddings + } finally if (results != null) results.close() + } + } + + def tagSequence(batch: Seq[Array[Int]], activation: String): Array[Array[Float]] = { + val batchLength = batch.length + val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + + val rawScores = detectedEngine match { + case ONNX.name => getRowScoresWithOnnx(batch) + case _ => getRawScoresWithTF(batch, maxSentenceLength) + } + + val dim = rawScores.length / batchLength val batchScores: Array[Array[Float]] = rawScores @@ -233,10 +251,10 @@ private[johnsnowlabs] class XlmRoBertaClassification( } def tagZeroShotSequence( - batch: Seq[Array[Int]], - entailmentId: Int, - contradictionId: Int, - activation: String): Array[Array[Float]] = { + batch: Seq[Array[Int]], + entailmentId: Int, + contradictionId: Int, + activation: String): Array[Array[Float]] = { val tensors = new TensorResources() val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max @@ -266,7 +284,7 @@ private[johnsnowlabs] class XlmRoBertaClassification( .toArray) } - val session = tensorflowWrapper.getTFSessionWithSignature( + val session = tensorflowWrapper.get.getTFSessionWithSignature( configProtoBytes = configProtoBytes, savedSignatures = signatures, initAllTables = false) @@ -274,7 +292,6 @@ private[johnsnowlabs] class XlmRoBertaClassification( val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) - val segmentTensors = tensors.createIntBufferTensor(shape, segmentBuffers) runner .feed( @@ -303,10 +320,29 @@ private[johnsnowlabs] class XlmRoBertaClassification( } def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = { - val tensors = new TensorResources() - + val batchLength = batch.length val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max + val (startLogits, endLogits) = detectedEngine match { + case ONNX.name => computeLogitsWithOnnx(batch) + case _ => computeLogitsWithTF(batch, maxSentenceLength) + } + + val endDim = endLogits.length / batchLength + val endScores: Array[Array[Float]] = + endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray + + val startDim = startLogits.length / batchLength + val startScores: Array[Array[Float]] = + startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + + (startScores, endScores) + } + + private def computeLogitsWithTF( + batch: Seq[Array[Int]], + maxSentenceLength: Int): (Array[Float], Array[Float]) = { val batchLength = batch.length + val tensors = new TensorResources() val tokenBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) val maskBuffers: IntDataBuffer = tensors.createIntBuffer(batchLength * maxSentenceLength) @@ -323,9 +359,11 @@ private[johnsnowlabs] class XlmRoBertaClassification( .write(sentence.map(x => if (x == sentencePadTokenId) 0 else 1)) } - val runner = tensorflowWrapper - .getTFSessionWithSignature(configProtoBytes = configProtoBytes, initAllTables = false) - .runner + val session = tensorflowWrapper.get.getTFSessionWithSignature( + configProtoBytes = configProtoBytes, + savedSignatures = signatures, + initAllTables = false) + val runner = session.runner val tokenTensors = tensors.createIntBufferTensor(shape, tokenBuffers) val maskTensors = tensors.createIntBufferTensor(shape, maskBuffers) @@ -352,21 +390,53 @@ private[johnsnowlabs] class XlmRoBertaClassification( tensors.clearSession(outs) tensors.clearTensors() - val endDim = endLogits.length / batchLength - val endScores: Array[Array[Float]] = - endLogits.grouped(endDim).map(scores => calculateSoftmax(scores)).toArray - - val startDim = startLogits.length / batchLength - val startScores: Array[Array[Float]] = - startLogits.grouped(startDim).map(scores => calculateSoftmax(scores)).toArray + (startLogits, endLogits) + } - (startScores, endScores) + private def computeLogitsWithOnnx(batch: Seq[Array[Int]]): (Array[Float], Array[Float]) = { + // [nb of encoded sentences , maxSentenceLength] + val (runner, env) = onnxWrapper.get.getSession(onnxSessionOptions) + + val tokenTensors = + OnnxTensor.createTensor(env, batch.map(x => x.map(x => x.toLong)).toArray) + val maskTensors = + OnnxTensor.createTensor( + env, + batch.map(sentence => sentence.map(x => if (x == 0L) 0L else 1L)).toArray) + + val inputs = + Map("input_ids" -> tokenTensors, "attention_mask" -> maskTensors).asJava + + try { + val output = runner.run(inputs) + try { + val startLogits = output + .get("start_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + val endLogits = output + .get("end_logits") + .get() + .asInstanceOf[OnnxTensor] + .getFloatBuffer + .array() + + tokenTensors.close() + maskTensors.close() + + (startLogits.slice(1, startLogits.length), endLogits.slice(1, endLogits.length)) + } finally if (output != null) output.close() + } } + def findIndexedToken( - tokenizedSentences: Seq[TokenizedSentence], - sentence: (WordpieceTokenizedSentence, Int), - tokenPiece: TokenPiece): Option[IndexedToken] = { + tokenizedSentences: Seq[TokenizedSentence], + sentence: (WordpieceTokenizedSentence, Int), + tokenPiece: TokenPiece): Option[IndexedToken] = { tokenizedSentences(sentence._2).indexedTokens.find(p => p.begin == tokenPiece.begin && tokenPiece.isWordStart) } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala index 01920477d5a672..f8a85cb1e44dca 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForQuestionAnswering.scala @@ -17,18 +17,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.{MergeTokenStrategy, XlmRoBertaClassification} +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.serialization.MapFeature import org.apache.spark.broadcast.Broadcast @@ -116,6 +109,7 @@ class XlmRoBertaForQuestionAnswering(override val uid: String) extends AnnotatorModel[XlmRoBertaForQuestionAnswering] with HasBatchedAnnotate[XlmRoBertaForQuestionAnswering] with WriteTensorflowModel + with WriteOnnxModel with WriteSentencePieceModel with HasCaseSensitiveProperties with HasEngine { @@ -196,13 +190,15 @@ class XlmRoBertaForQuestionAnswering(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): XlmRoBertaForQuestionAnswering = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new XlmRoBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = Map.empty[String, Int], @@ -253,19 +249,25 @@ class XlmRoBertaForQuestionAnswering(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_xlm_roberta_classification", - XlmRoBertaForQuestionAnswering.tfFile, - configProtoBytes = getConfigProtoBytes) - writeSentencePieceModel( - path, - spark, - getModelIfNotSet.spp, - "_xlmroberta", - XlmRoBertaForQuestionAnswering.sppFile) + val suffix = "_xlm_roberta_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + XlmRoBertaForQuestionAnswering.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + XlmRoBertaForQuestionAnswering.onnxFile) + } } } @@ -291,11 +293,13 @@ trait ReadablePretrainedXlmRoBertaForQAModel } trait ReadXlmRoBertaForQuestionAnsweringDLModel - extends ReadTensorflowModel + extends ReadTensorflowModel + with ReadOnnxModel with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[XlmRoBertaForQuestionAnswering] => - override val tfFile: String = "xlm_roberta_classification_tensorflow" + override val tfFile: String = "xlm_roberta_classification_tf" + override val onnxFile: String = "xlm_roberta_classification_onnx" override val sppFile: String = "xlmroberta_spp" def readModel( @@ -303,10 +307,25 @@ trait ReadXlmRoBertaForQuestionAnsweringDLModel path: String, spark: SparkSession): Unit = { - val tf = - readTensorflowModel(path, spark, "_xlm_roberta_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "xlm_roberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "xlm_roberta_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -324,7 +343,7 @@ trait ReadXlmRoBertaForQuestionAnsweringDLModel detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -337,7 +356,12 @@ trait ReadXlmRoBertaForQuestionAnsweringDLModel */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) @@ -351,5 +375,6 @@ trait ReadXlmRoBertaForQuestionAnsweringDLModel * for the documentation. */ object XlmRoBertaForQuestionAnswering - extends ReadablePretrainedXlmRoBertaForQAModel + extends ReadablePretrainedXlmRoBertaForQAModel with ReadXlmRoBertaForQuestionAnsweringDLModel + diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala index add55d9270b8be..8adbedba232274 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForSequenceClassification.scala @@ -16,20 +16,12 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl -import com.johnsnowlabs.ml.ai.XlmRoBertaClassification +import com.johnsnowlabs.ml.ai.{MergeTokenStrategy, XlmRoBertaClassification} +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -123,6 +115,7 @@ import org.apache.spark.sql.SparkSession class XlmRoBertaForSequenceClassification(override val uid: String) extends AnnotatorModel[XlmRoBertaForSequenceClassification] with HasBatchedAnnotate[XlmRoBertaForSequenceClassification] + with WriteOnnxModel with WriteTensorflowModel with WriteSentencePieceModel with HasCaseSensitiveProperties @@ -238,13 +231,15 @@ class XlmRoBertaForSequenceClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): XlmRoBertaForSequenceClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new XlmRoBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -304,19 +299,25 @@ class XlmRoBertaForSequenceClassification(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_xlm_roberta_classification", - XlmRoBertaForSequenceClassification.tfFile, - configProtoBytes = getConfigProtoBytes) - writeSentencePieceModel( - path, - spark, - getModelIfNotSet.spp, - "_xlmroberta", - XlmRoBertaForSequenceClassification.sppFile) + val suffix = "_xlm_roberta_classification" + + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + XlmRoBertaForSequenceClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + XlmRoBertaForSequenceClassification.onnxFile) + } } } @@ -341,10 +342,14 @@ trait ReadablePretrainedXlmRoBertaForSequenceModel super.pretrained(name, lang, remoteLoc) } -trait ReadXlmRoBertaForSequenceDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadXlmRoBertaForSequenceDLModel + extends ReadTensorflowModel + with ReadOnnxModel + with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[XlmRoBertaForSequenceClassification] => - override val tfFile: String = "xlm_roberta_classification_tensorflow" + override val tfFile: String = "xlm_roberta_classification_tf" + override val onnxFile: String = "xlm_roberta_classification_onnx" override val sppFile: String = "xlmroberta_spp" def readModel( @@ -352,10 +357,25 @@ trait ReadXlmRoBertaForSequenceDLModel extends ReadTensorflowModel with ReadSent path: String, spark: SparkSession): Unit = { - val tf = - readTensorflowModel(path, spark, "_xlm_roberta_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "xlm_roberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "xlm_roberta_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -376,7 +396,7 @@ trait ReadXlmRoBertaForSequenceDLModel extends ReadTensorflowModel with ReadSent detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -389,7 +409,12 @@ trait ReadXlmRoBertaForSequenceDLModel extends ReadTensorflowModel with ReadSent */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala index ded252b097d481..661417eb201cc3 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForTokenClassification.scala @@ -17,19 +17,11 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.XlmRoBertaClassification +import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel} import com.johnsnowlabs.ml.tensorflow._ -import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ - ReadSentencePieceModel, - SentencePieceWrapper, - WriteSentencePieceModel -} -import com.johnsnowlabs.ml.util.LoadExternalModel.{ - loadSentencePieceAsset, - loadTextAsset, - modelSanityCheck, - notSupportedEngineError -} -import com.johnsnowlabs.ml.util.TensorFlow +import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ReadSentencePieceModel, SentencePieceWrapper, WriteSentencePieceModel} +import com.johnsnowlabs.ml.util.LoadExternalModel.{loadSentencePieceAsset, loadTextAsset, modelSanityCheck, notSupportedEngineError} +import com.johnsnowlabs.ml.util.{ONNX, TensorFlow} import com.johnsnowlabs.nlp._ import com.johnsnowlabs.nlp.annotators.common._ import com.johnsnowlabs.nlp.serialization.MapFeature @@ -122,6 +114,7 @@ import org.apache.spark.sql.SparkSession class XlmRoBertaForTokenClassification(override val uid: String) extends AnnotatorModel[XlmRoBertaForTokenClassification] with HasBatchedAnnotate[XlmRoBertaForTokenClassification] + with WriteOnnxModel with WriteTensorflowModel with WriteSentencePieceModel with HasCaseSensitiveProperties @@ -217,13 +210,15 @@ class XlmRoBertaForTokenClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): XlmRoBertaForTokenClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new XlmRoBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -276,21 +271,26 @@ class XlmRoBertaForTokenClassification(override val uid: String) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) - writeTensorflowModelV2( - path, - spark, - getModelIfNotSet.tensorflowWrapper, - "_xlm_roberta_classification", - XlmRoBertaForTokenClassification.tfFile, - configProtoBytes = getConfigProtoBytes) - writeSentencePieceModel( - path, - spark, - getModelIfNotSet.spp, - "_xlmroberta", - XlmRoBertaForTokenClassification.sppFile) - } + val suffix = "_xlm_roberta_classification" + getEngine match { + case TensorFlow.name => + writeTensorflowModelV2( + path, + spark, + getModelIfNotSet.tensorflowWrapper.get, + suffix, + XlmRoBertaForTokenClassification.tfFile, + configProtoBytes = getConfigProtoBytes) + case ONNX.name => + writeOnnxModel( + path, + spark, + getModelIfNotSet.onnxWrapper.get, + suffix, + XlmRoBertaForTokenClassification.onnxFile) + } + } } trait ReadablePretrainedXlmRoBertaForTokenModel @@ -313,10 +313,14 @@ trait ReadablePretrainedXlmRoBertaForTokenModel super.pretrained(name, lang, remoteLoc) } -trait ReadXlmRoBertaForTokenDLModel extends ReadTensorflowModel with ReadSentencePieceModel { +trait ReadXlmRoBertaForTokenDLModel + extends ReadTensorflowModel + with ReadOnnxModel + with ReadSentencePieceModel { this: ParamsAndFeaturesReadable[XlmRoBertaForTokenClassification] => - override val tfFile: String = "xlm_roberta_classification_tensorflow" + override val tfFile: String = "xlm_roberta_classification_tf" + override val onnxFile: String = "xlm_roberta_classification_onnx" override val sppFile: String = "xlmroberta_spp" def readModel( @@ -324,10 +328,26 @@ trait ReadXlmRoBertaForTokenDLModel extends ReadTensorflowModel with ReadSentenc path: String, spark: SparkSession): Unit = { - val tf = - readTensorflowModel(path, spark, "_xlm_roberta_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + + instance.getEngine match { + case TensorFlow.name => + val tfWrapper = + readTensorflowModel(path, spark, "xlm_roberta_classification_tf", initAllTables = false) + instance.setModelIfNotSet(spark, Some(tfWrapper), None, spp) + case ONNX.name => + val onnxWrapper = + readOnnxModel( + path, + spark, + "xlm_roberta_classification_onnx", + zipped = true, + useBundle = false, + None) + instance.setModelIfNotSet(spark, None, Some(onnxWrapper), spp) + case _ => + throw new Exception(notSupportedEngineError) + } } addReader(readModel) @@ -338,7 +358,6 @@ trait ReadXlmRoBertaForTokenDLModel extends ReadTensorflowModel with ReadSentenc val spModel = loadSentencePieceAsset(localModelPath, "sentencepiece.bpe.model") val labels = loadTextAsset(localModelPath, "labels.txt").zipWithIndex.toMap - val annotatorModel = new XlmRoBertaForTokenClassification() .setLabels(labels) @@ -346,7 +365,7 @@ trait ReadXlmRoBertaForTokenDLModel extends ReadTensorflowModel with ReadSentenc detectedEngine match { case TensorFlow.name => - val (wrapper, signatures) = + val (tfWrapper, signatures) = TensorflowWrapper.read(localModelPath, zipped = false, useBundle = true) val _signatures = signatures match { @@ -359,7 +378,12 @@ trait ReadXlmRoBertaForTokenDLModel extends ReadTensorflowModel with ReadSentenc */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(tfWrapper), None, spModel) + + case ONNX.name => + val onnxWrapper = OnnxWrapper.read(localModelPath, zipped = false, useBundle = true) + annotatorModel + .setModelIfNotSet(spark, None, Some(onnxWrapper), spModel) case _ => throw new Exception(notSupportedEngineError) diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala index 981a3ff73f77fa..0cc16109726193 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/classifier/dl/XlmRoBertaForZeroShotClassification.scala @@ -17,6 +17,7 @@ package com.johnsnowlabs.nlp.annotators.classifier.dl import com.johnsnowlabs.ml.ai.XlmRoBertaClassification +import com.johnsnowlabs.ml.onnx.OnnxWrapper import com.johnsnowlabs.ml.tensorflow._ import com.johnsnowlabs.ml.tensorflow.sentencepiece.{ ReadSentencePieceModel, @@ -250,13 +251,15 @@ class XlmRoBertaForZeroShotClassification(override val uid: String) /** @group setParam */ def setModelIfNotSet( spark: SparkSession, - tensorflowWrapper: TensorflowWrapper, + tensorflowWrapper: Option[TensorflowWrapper], + onnxWrapper: Option[OnnxWrapper], spp: SentencePieceWrapper): XlmRoBertaForZeroShotClassification = { if (_model.isEmpty) { _model = Some( spark.sparkContext.broadcast( new XlmRoBertaClassification( tensorflowWrapper, + onnxWrapper, spp, configProtoBytes = getConfigProtoBytes, tags = $$(labels), @@ -322,7 +325,7 @@ class XlmRoBertaForZeroShotClassification(override val uid: String) writeTensorflowModelV2( path, spark, - getModelIfNotSet.tensorflowWrapper, + getModelIfNotSet.tensorflowWrapper.get, "_xlmroberta_classification", XlmRoBertaForZeroShotClassification.tfFile, configProtoBytes = getConfigProtoBytes) @@ -373,7 +376,7 @@ trait ReadXlmRoBertaForZeroShotDLModel extends ReadTensorflowModel with ReadSent val tf = readTensorflowModel(path, spark, "_xlmroberta_classification_tf", initAllTables = false) val spp = readSentencePieceModel(path, spark, "_xlmroberta_spp", sppFile) - instance.setModelIfNotSet(spark, tf, spp) + instance.setModelIfNotSet(spark, Some(tf), None, spp) } addReader(readModel) @@ -429,7 +432,7 @@ trait ReadXlmRoBertaForZeroShotDLModel extends ReadTensorflowModel with ReadSent */ annotatorModel .setSignatures(_signatures) - .setModelIfNotSet(spark, wrapper, spModel) + .setModelIfNotSet(spark, Some(wrapper), None, spModel) case _ => throw new Exception(notSupportedEngineError)