-
Notifications
You must be signed in to change notification settings - Fork 717
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding colab notebook for M2M100 (#14191)
- Loading branch information
1 parent
901c884
commit 19367f2
Showing
1 changed file
with
257 additions
and
0 deletions.
There are no files selected for viewing
257 changes: 257 additions & 0 deletions
257
...n/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,257 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "rAklGDKcIxuT" | ||
}, | ||
"source": [ | ||
"![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", | ||
"\n", | ||
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/tree/master/examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Multilingual Translation with M2M100" | ||
], | ||
"metadata": { | ||
"id": "_b6aR_k6Oi9Q" | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "uI7yhCibA9cf" | ||
}, | ||
"source": [ | ||
"## Colab Setup" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": { | ||
"id": "4WQLLrIUA9cg", | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"outputId": "87c0ea9b-0f20-4361-fa2e-5eb6f8113507" | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Installing PySpark 3.2.3 and Spark NLP 5.3.0\n", | ||
"setup Colab for PySpark 3.2.3 and Spark NLP 5.3.0\n", | ||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | ||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | ||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m564.8/564.8 kB\u001b[0m \u001b[31m59.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | ||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m23.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | ||
"\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"id": "KzMHa0HdA9ch", | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"outputId": "f190dcf5-01bb-49d1-e17a-79a4123c3a13" | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Spark NLP version 5.3.0\n", | ||
"Apache Spark version: 3.2.3\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import sparknlp\n", | ||
"from sparknlp.base import *\n", | ||
"from sparknlp.annotator import *\n", | ||
"from pyspark.ml import Pipeline\n", | ||
"\n", | ||
"spark = sparknlp.start()\n", | ||
"\n", | ||
"print(\"Spark NLP version\", sparknlp.version())\n", | ||
"print(\"Apache Spark version:\", spark.version)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "1zvc_5Y1Ixuk" | ||
}, | ||
"source": [ | ||
"# Define Spark NLP pipeline" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "2Y9GpdJhXIpD" | ||
}, | ||
"source": [ | ||
"**A sample text in Chinese - we'll translate it to English**" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"id": "POzRV_wSK3YF" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"text = \"\"\"除了是北方之王之外,约翰·斯诺还是一位英国医生,也是麻醉和医疗卫生发展的领导者。 他被认为是第一个利用数据治愈 1854 年霍乱爆发的人。\"\"\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": { | ||
"id": "0UNHPnccIxul", | ||
"outputId": "d0f00dcc-d7bc-480d-9e82-e924eace7d1a", | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
} | ||
}, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"m2m100_418M download started this may take some time.\n", | ||
"Approximate size to download 2.8 GB\n", | ||
"[OK!]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"documentAssembler = DocumentAssembler()\\\n", | ||
" .setInputCol(\"text\")\\\n", | ||
" .setOutputCol(\"document\")\n", | ||
"\n", | ||
"m2m100 = M2M100Transformer.pretrained() \\\n", | ||
" .setInputCols([\"document\"]) \\\n", | ||
" .setMaxOutputLength(50) \\\n", | ||
" .setOutputCol(\"generation\") \\\n", | ||
" .setSrcLang(\"zh\") \\\n", | ||
" .setTgtLang(\"en\")\n", | ||
"\n", | ||
"tl_pipeline = Pipeline(\n", | ||
" stages=[documentAssembler, m2m100]\n", | ||
" )" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"# Light Pipeline version" | ||
], | ||
"metadata": { | ||
"id": "v4QtpYx2SK7y" | ||
} | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"Let's create the light Pipiline" | ||
], | ||
"metadata": { | ||
"id": "XpcordqsSmE3" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"empty_df = spark.createDataFrame([[\"\"]]).toDF('text')\n", | ||
"pipeline_model = tl_pipeline.fit(empty_df)\n", | ||
"model = LightPipeline(pipeline_model)\n", | ||
"res = model.fullAnnotate(text)" | ||
], | ||
"metadata": { | ||
"id": "CoQAJpOxQ1zb" | ||
}, | ||
"execution_count": 7, | ||
"outputs": [] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"source": [ | ||
"visualize the results" | ||
], | ||
"metadata": { | ||
"id": "t-L_JARWSszu" | ||
} | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"source": [ | ||
"print ('Original:', text, '\\n\\n')\n", | ||
"\n", | ||
"print ('Translated:\\n')\n", | ||
"for sentence in res[0]['generation']:\n", | ||
" print (sentence.result)" | ||
], | ||
"metadata": { | ||
"id": "ZnD-QGuJREW8", | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"outputId": "51258f55-1ec2-4e86-f312-1e057dbec5b8" | ||
}, | ||
"execution_count": 16, | ||
"outputs": [ | ||
{ | ||
"output_type": "stream", | ||
"name": "stdout", | ||
"text": [ | ||
"Original: 除了是北方之王之外,约翰·斯诺还是一位英国医生,也是麻醉和医疗卫生发展的领导者。 他被认为是第一个利用数据治愈 1854 年霍乱爆发的人。 \n", | ||
"\n", | ||
"\n", | ||
"Translated:\n", | ||
"\n", | ||
"In addition to being the King of the North, John Snow was also a British doctor and a leader in the development of anesthesia and health care. he was considered the first person to use data to cure the 1854 cholera outbreak.\n" | ||
] | ||
} | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |