adding colab notebook for M2M100 (#14191)

JohnSnowLabs · Mar 3, 2024 · 19367f2 · 19367f2
1 parent 901c884
commit 19367f2
Showing 1 changed file with 257 additions and 0 deletions.
diff --git a/...n/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb b/...n/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb
@@ -0,0 +1,257 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "rAklGDKcIxuT"
+      },
+      "source": [
+        "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
+        "\n",
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Multilingual Translation with M2M100"
+      ],
+      "metadata": {
+        "id": "_b6aR_k6Oi9Q"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uI7yhCibA9cf"
+      },
+      "source": [
+        "## Colab Setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "4WQLLrIUA9cg",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "87c0ea9b-0f20-4361-fa2e-5eb6f8113507"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Installing PySpark 3.2.3 and Spark NLP 5.3.0\n",
+            "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+          ]
+        }
+      ],
+      "source": [
+        "!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "id": "KzMHa0HdA9ch",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "f190dcf5-01bb-49d1-e17a-79a4123c3a13"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Spark NLP version 5.3.0\n",
+            "Apache Spark version: 3.2.3\n"
+          ]
+        }
+      ],
+      "source": [
+        "import sparknlp\n",
+        "from sparknlp.base import *\n",
+        "from sparknlp.annotator import *\n",
+        "from pyspark.ml import Pipeline\n",
+        "\n",
+        "spark = sparknlp.start()\n",
+        "\n",
+        "print(\"Spark NLP version\", sparknlp.version())\n",
+        "print(\"Apache Spark version:\", spark.version)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "1zvc_5Y1Ixuk"
+      },
+      "source": [
+        "# Define Spark NLP pipeline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "2Y9GpdJhXIpD"
+      },
+      "source": [
+        "**A sample text in Chinese - we'll translate it to English**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "id": "POzRV_wSK3YF"
+      },
+      "outputs": [],
+      "source": [
+        "text = \"\"\"除了是北方之王之外，约翰·斯诺还是一位英国医生，也是麻醉和医疗卫生发展的领导者。 他被认为是第一个利用数据治愈 1854 年霍乱爆发的人。\"\"\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "id": "0UNHPnccIxul",
+        "outputId": "d0f00dcc-d7bc-480d-9e82-e924eace7d1a",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "m2m100_418M download started this may take some time.\n",
+            "Approximate size to download 2.8 GB\n",
+            "[OK!]\n"
+          ]
+        }
+      ],
+      "source": [
+        "documentAssembler = DocumentAssembler()\\\n",
+        "    .setInputCol(\"text\")\\\n",
+        "    .setOutputCol(\"document\")\n",
+        "\n",
+        "m2m100 = M2M100Transformer.pretrained() \\\n",
+        "    .setInputCols([\"document\"]) \\\n",
+        "    .setMaxOutputLength(50) \\\n",
+        "    .setOutputCol(\"generation\") \\\n",
+        "    .setSrcLang(\"zh\") \\\n",
+        "    .setTgtLang(\"en\")\n",
+        "\n",
+        "tl_pipeline = Pipeline(\n",
+        "    stages=[documentAssembler, m2m100]\n",
+        "    )"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Light Pipeline version"
+      ],
+      "metadata": {
+        "id": "v4QtpYx2SK7y"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Let's create the light Pipiline"
+      ],
+      "metadata": {
+        "id": "XpcordqsSmE3"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "empty_df = spark.createDataFrame([[\"\"]]).toDF('text')\n",
+        "pipeline_model = tl_pipeline.fit(empty_df)\n",
+        "model = LightPipeline(pipeline_model)\n",
+        "res = model.fullAnnotate(text)"
+      ],
+      "metadata": {
+        "id": "CoQAJpOxQ1zb"
+      },
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "visualize the results"
+      ],
+      "metadata": {
+        "id": "t-L_JARWSszu"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "print ('Original:', text, '\\n\\n')\n",
+        "\n",
+        "print ('Translated:\\n')\n",
+        "for sentence in res[0]['generation']:\n",
+        "  print (sentence.result)"
+      ],
+      "metadata": {
+        "id": "ZnD-QGuJREW8",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "51258f55-1ec2-4e86-f312-1e057dbec5b8"
+      },
+      "execution_count": 16,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Original: 除了是北方之王之外，约翰·斯诺还是一位英国医生，也是麻醉和医疗卫生发展的领导者。 他被认为是第一个利用数据治愈 1854 年霍乱爆发的人。 \n",
+            "\n",
+            "\n",
+            "Translated:\n",
+            "\n",
+            "In addition to being the King of the North, John Snow was also a British doctor and a leader in the development of anesthesia and health care. he was considered the first person to use data to cure the 1854 cholera outbreak.\n"
+          ]
+        }
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}