[SPARKNLP-890] ONNX E5 MPnet example (#13958)

JohnSnowLabs · Sep 7, 2023 · 637f007 · 637f007
1 parent a802ab8
commit 637f007
Show file tree

Hide file tree

Showing 2 changed files with 2,921 additions and 0 deletions.
diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb
@@ -0,0 +1,388 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_E5.ipynb)\n",
+    "\n",
+    "# Import ONNX E5 models from HuggingFace 🤗 into Spark NLP 🚀\n",
+    "\n",
+    "Let's keep in mind a few things before we start 😊\n",
+    "\n",
+    "- ONNX support for this annotator was introduced in  `Spark NLP 5.1.0`, enabling high performance inference for models. Please make sure you have upgraded to the latest Spark NLP release.\n",
+    "- You can import models for E5 from HuggingFace and they have to be in `Sentence Similarity` category. Meaning, you cannot use E5 models trained/fine-tuned on a specific task such as token/sequence classification."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Export and Save HuggingFace model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Let's install `transformers` package with the `onnx` extension and it's dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n",
+    "- We lock `transformers` on version `4.29.1`. This doesn't mean it won't work with the future releases, but we wanted you to know which versions have been tested successfully."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.1/7.1 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.6/380.6 kB\u001b[0m \u001b[31m22.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m21.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m40.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 kB\u001b[0m \u001b[31m7.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m454.7/454.7 kB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.9/5.9 MB\u001b[0m \u001b[31m54.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.7/212.7 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m39.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.6/14.6 MB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.5/55.5 kB\u001b[0m \u001b[31m3.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m46.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m37.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+      "tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\n",
+      "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.20.2 which is incompatible.\u001b[0m\u001b[31m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install -q --upgrade transformers[onnx]==4.29.1 optimum"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use this to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n",
+    "- We'll use [intfloat/e5-small-v2](https://huggingface.co/intfloat/e5-small-v2) model from HuggingFace as an example and load it as a `ORTModelForFeatureExtraction`, representing an ONNX model.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Framework not specified. Using pt to export to ONNX.\n",
+      "Using framework PyTorch: 2.0.1+cu118\n",
+      "Overriding 1 configuration item(s)\n",
+      "\t- use_cache -> False\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "============= Diagnostic Run torch.onnx.export version 2.0.1+cu118 =============\n",
+      "verbose: False, log level: Level.ERROR\n",
+      "======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from optimum.onnxruntime import ORTModelForFeatureExtraction\n",
+    "\n",
+    "MODEL_NAME = \"intfloat/e5-small-v2\"\n",
+    "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\"\n",
+    "\n",
+    "ort_model = ORTModelForFeatureExtraction.from_pretrained(MODEL_NAME, export=True)\n",
+    "\n",
+    "# Save the ONNX model\n",
+    "ort_model.save_pretrained(EXPORT_PATH)\n",
+    "\n",
+    "# Create directory for assets and move the tokenizer files.\n",
+    "# A separate folder is needed for Spark NLP.\n",
+    "!mkdir {EXPORT_PATH}/assets\n",
+    "!mv {EXPORT_PATH}/vocab.txt {EXPORT_PATH}/assets/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's have a look inside these two directories and see what we are dealing with:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 130692\n",
+      "drwxr-xr-x 2 root root      4096 Sep  5 09:03 assets\n",
+      "-rw-r--r-- 1 root root       626 Sep  5 09:03 config.json\n",
+      "-rw-r--r-- 1 root root 133093467 Sep  5 09:03 model.onnx\n",
+      "-rw-r--r-- 1 root root       125 Sep  5 09:03 special_tokens_map.json\n",
+      "-rw-r--r-- 1 root root       314 Sep  5 09:03 tokenizer_config.json\n",
+      "-rw-r--r-- 1 root root    711396 Sep  5 09:03 tokenizer.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -l {EXPORT_PATH}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 228\n",
+      "-rw-r--r-- 1 root root 231508 Sep  5 09:03 vocab.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -l {EXPORT_PATH}/assets"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import and Save E5 in Spark NLP\n",
+    "\n",
+    "- Let's install and setup Spark NLP in Google Colab\n",
+    "- This part is pretty easy via our simple script"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Installing PySpark 3.2.3 and Spark NLP 5.1.0\n",
+      "setup Colab for PySpark 3.2.3 and Spark NLP 5.1.0\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m531.2/531.2 kB\u001b[0m \u001b[31m39.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+     ]
+    }
+   ],
+   "source": [
+    "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's start Spark with Spark NLP included via our simple `start()` function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sparknlp\n",
+    "# let's start Spark with Spark NLP\n",
+    "spark = sparknlp.start()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Let's use `loadSavedModel` functon in `E5Embeddings` which allows us to load the ONNX model\n",
+    "- Most params will be set automatically. They can also be set later after loading the model in `E5Embeddings` during runtime, so don't worry about setting them now\n",
+    "- `loadSavedModel` accepts two params, first is the path to the exported model. The second is the SparkSession that is `spark` variable we previously started via `sparknlp.start()`\n",
+    "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.st and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sparknlp.annotator import *\n",
+    "\n",
+    "# All these params should be identical to the original ONNX model\n",
+    "E5 = E5Embeddings.loadSavedModel(f\"{EXPORT_PATH}\", spark)\\\n",
+    "    .setInputCols([\"document\"])\\\n",
+    "    .setOutputCol(\"E5\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- Let's save it on disk so it is easier to be moved around and also be used later via `.load` function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "E5.write().overwrite().save(f\"{MODEL_NAME}_spark_nlp\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's clean up stuff we don't need anymore"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!rm -rf {EXPORT_PATH}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Awesome  😎 !\n",
+    "\n",
+    "This is your ONNX E5 model from HuggingFace 🤗  loaded and saved by Spark NLP 🚀"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 130008\n",
+      "-rw-r--r-- 1 root root 133113905 Sep  5 08:57 e5_onnx\n",
+      "drwxr-xr-x 3 root root      4096 Sep  5 08:57 fields\n",
+      "drwxr-xr-x 2 root root      4096 Sep  5 08:57 metadata\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls -l {MODEL_NAME}_spark_nlp"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny E5 model 😊"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sparknlp\n",
+    "\n",
+    "from sparknlp.base import *\n",
+    "from sparknlp.annotator import *\n",
+    "\n",
+    "document_assembler = DocumentAssembler()\\\n",
+    "    .setInputCol(\"text\")\\\n",
+    "    .setOutputCol(\"document\")\n",
+    "\n",
+    "E5_loaded = E5Embeddings.load(f\"{MODEL_NAME}_spark_nlp\")\\\n",
+    "    .setInputCols([\"document\"])\\\n",
+    "    .setOutputCol(\"E5\")\\\n",
+    "\n",
+    "pipeline = Pipeline(\n",
+    "    stages = [\n",
+    "        document_assembler,\n",
+    "        E5_loaded\n",
+    "  ])\n",
+    "\n",
+    "data = spark.createDataFrame([['William Henry Gates III (born October 28, 1955) is an American business magnate, software developer, investor,and philanthropist.']]).toDF(\"text\")\n",
+    "model = pipeline.fit(data)\n",
+    "result = model.transform(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+\n",
+      "|          embeddings|\n",
+      "+--------------------+\n",
+      "|[-0.35357836, 0.3...|\n",
+      "+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "result.selectExpr(\"explode(E5.embeddings) as embeddings\").show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "That's it! You can now go wild and use hundreds of E5 models from HuggingFace 🤗 in Spark NLP 🚀\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}