diff --git a/examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb b/examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb new file mode 100644 index 00000000000000..48d94f53211b24 --- /dev/null +++ b/examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb @@ -0,0 +1,257 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "rAklGDKcIxuT" + }, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Multilingual Translation with M2M100" + ], + "metadata": { + "id": "_b6aR_k6Oi9Q" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uI7yhCibA9cf" + }, + "source": [ + "## Colab Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "4WQLLrIUA9cg", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "87c0ea9b-0f20-4361-fa2e-5eb6f8113507" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.3.0\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "KzMHa0HdA9ch", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f190dcf5-01bb-49d1-e17a-79a4123c3a13" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Spark NLP version 5.3.0\n", + "Apache Spark version: 3.2.3\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "spark = sparknlp.start()\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())\n", + "print(\"Apache Spark version:\", spark.version)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1zvc_5Y1Ixuk" + }, + "source": [ + "# Define Spark NLP pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2Y9GpdJhXIpD" + }, + "source": [ + "**A sample text in Chinese - we'll translate it to English**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "POzRV_wSK3YF" + }, + "outputs": [], + "source": [ + "text = \"\"\"除了是北方之王之外,约翰·斯诺还是一位英国医生,也是麻醉和医疗卫生发展的领导者。 他被认为是第一个利用数据治愈 1854 年霍乱爆发的人。\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "0UNHPnccIxul", + "outputId": "d0f00dcc-d7bc-480d-9e82-e924eace7d1a", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "m2m100_418M download started this may take some time.\n", + "Approximate size to download 2.8 GB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "documentAssembler = DocumentAssembler()\\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "m2m100 = M2M100Transformer.pretrained() \\\n", + " .setInputCols([\"document\"]) \\\n", + " .setMaxOutputLength(50) \\\n", + " .setOutputCol(\"generation\") \\\n", + " .setSrcLang(\"zh\") \\\n", + " .setTgtLang(\"en\")\n", + "\n", + "tl_pipeline = Pipeline(\n", + " stages=[documentAssembler, m2m100]\n", + " )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Light Pipeline version" + ], + "metadata": { + "id": "v4QtpYx2SK7y" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's create the light Pipiline" + ], + "metadata": { + "id": "XpcordqsSmE3" + } + }, + { + "cell_type": "code", + "source": [ + "empty_df = spark.createDataFrame([[\"\"]]).toDF('text')\n", + "pipeline_model = tl_pipeline.fit(empty_df)\n", + "model = LightPipeline(pipeline_model)\n", + "res = model.fullAnnotate(text)" + ], + "metadata": { + "id": "CoQAJpOxQ1zb" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "visualize the results" + ], + "metadata": { + "id": "t-L_JARWSszu" + } + }, + { + "cell_type": "code", + "source": [ + "print ('Original:', text, '\\n\\n')\n", + "\n", + "print ('Translated:\\n')\n", + "for sentence in res[0]['generation']:\n", + " print (sentence.result)" + ], + "metadata": { + "id": "ZnD-QGuJREW8", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "51258f55-1ec2-4e86-f312-1e057dbec5b8" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Original: 除了是北方之王之外,约翰·斯诺还是一位英国医生,也是麻醉和医疗卫生发展的领导者。 他被认为是第一个利用数据治愈 1854 年霍乱爆发的人。 \n", + "\n", + "\n", + "Translated:\n", + "\n", + "In addition to being the King of the North, John Snow was also a British doctor and a leader in the development of anesthesia and health care. he was considered the first person to use data to cure the 1854 cholera outbreak.\n" + ] + } + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file