Uploading missing notebooks from Spark NLP v5.1.4 (#14196)

* example notebook for DocumentCharacterTextSplitter
* example notebook for DeBertaForZeroShotClassification
* example notebooks for BGEEmbeddings and MPNetEmbeddings
* example notebook for MPNetForQuestionAnswering
* example notebook + path for MPNetForSequenceClassification
* Delete examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb
* Add files via upload
* Delete examples/python/annotation/text/english/language-translation/Multilingual_Translation_with_M2M100.ipynb
* fixing colab link for M2M100 notebook
1 parent 6b181a6 · commit d4b4383 · 7 changed files with 2,340 additions and 42 deletions.
379 changes: 379 additions & 0 deletions in examples/python/annotation/text/english/DocumentCharacterTextSplitter.ipynb
![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/DocumentCharacterTextSplitter.ipynb)

## Colab + Data Setup
```python
!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash
```

```
Installing PySpark 3.2.3 and Spark NLP 5.2.2
setup Colab for PySpark 3.2.3 and Spark NLP 5.2.2
```
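Outside Colab, the same environment can be set up by hand. A minimal sketch, assuming the pinned versions reported by the setup script above:

```python
# Install the same versions the Colab setup script reports.
!pip install -q pyspark==3.2.3 spark-nlp==5.2.2
```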
```python
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/587f79020de7bc09c2b2fceb37ec258bad57e425/src/test/resources/spell/sherlockholmes.txt
```
```python
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

spark = sparknlp.start()

print(f"Spark NLP version {sparknlp.version()}\nApache Spark version: {spark.version}")
```

```
Spark NLP version 5.2.2
Apache Spark version: 3.2.3
```
```python
textDF = spark.read.text(
    "sherlockholmes.txt",
    wholetext=True
).toDF("text")
```
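With `wholetext=True`, `spark.read.text` returns the entire file as a single row, which is what the splitter expects. A quick sanity check, assuming the download above succeeded:

```python
# Expect exactly one row holding the whole book as one string.
print(textDF.count())               # 1
print(len(textDF.first()["text"]))  # total number of characters to be split
```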
# Create the DocumentCharacterTextSplitter and a Spark NLP Pipeline
```python
DocumentCharacterTextSplitter
```

```
sparknlp.annotator.document_character_text_splitter.DocumentCharacterTextSplitter
```
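The splitter is rule-based and configured entirely through parameters, so there is no pretrained model to download. One way to see what is configurable is the standard PySpark `Params` API; a small sketch:

```python
# Print every configurable parameter of the splitter with its docstring.
for param in DocumentCharacterTextSplitter().params:
    print(f"{param.name}: {param.doc}")
```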
Let's create a Spark NLP pipeline with two stages: a `DocumentAssembler` and the `DocumentCharacterTextSplitter`, configured to produce chunks of roughly 20,000 characters with a 200-character overlap:
```python
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

textSplitter = DocumentCharacterTextSplitter() \
    .setInputCols(["document"]) \
    .setOutputCol("splits") \
    .setChunkSize(20000) \
    .setChunkOverlap(200) \
    .setExplodeSplits(True)

pipeline = Pipeline().setStages([documentAssembler, textSplitter])
result = pipeline.fit(textDF).transform(textDF)

result.selectExpr(
    "splits.result",
    "splits[0].begin",
    "splits[0].end",
    "splits[0].end - splits[0].begin as length").show(8, truncate=80)
```

```
+--------------------------------------------------------------------------------+---------------+-------------+------+
|                                                                          result|splits[0].begin|splits[0].end|length|
+--------------------------------------------------------------------------------+---------------+-------------+------+
|[{"payload":{"allShortcutsEnabled":false,"fileTree":{"src/test/resources/spel...|              0|        19998| 19998|
|[Doctor, and give us your best","attention.\"","","A slow and heavy step, whi...|          19806|        39805| 19999|
|[he said as he turned hungrily on the simple fare that","our landlady had pro...|          39606|        59604| 19998|
|[armchair and","putting his fingertips together, as was his custom when in","...|          59407|        79406| 19999|
|[after a time, he did not come in at","all. Still, of course, I never dared t...|          79208|        99201| 19993|
|[least an hour before us,\" he remarked, \"for they can","hardly take any ste...|          99003|       119000| 19997|
|[after father's death, and","a man who was nearly fifteen years younger than ...|         118801|       138782| 19981|
|[some of the details are of interest. The only drawback","is that there is no...|         138599|       158591| 19992|
+--------------------------------------------------------------------------------+---------------+-------------+------+
only showing top 8 rows
```
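The `begin`/`end` offsets make the overlap easy to verify: each chunk starts roughly `chunkOverlap` characters before the previous chunk ends. A minimal check, reusing the `result` DataFrame from the cell above:

```python
# Collect split boundaries and measure the overlap between consecutive chunks.
bounds = result.selectExpr("splits[0].begin as begin", "splits[0].end as end").collect()

for prev, cur in zip(bounds, bounds[1:]):
    # Close to chunkOverlap (200), but can be slightly smaller because chunks
    # are cut at separator boundaries and whitespace-trimmed.
    print(f"overlap: {prev.end - cur.begin} characters")
```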
# Now let's build another pipeline to verify that the splitter works as expected
Let's get the data ready:
```python
df = spark.createDataFrame([
    [("All emotions, and that\none particularly, were abhorrent to his cold, "
      "precise but\nadmirably balanced mind.\n\nHe was, I take it, the most "
      "perfect\nreasoning and observing machine that the world has seen.")]
]).toDF("text")
```
Let's create a Spark NLP pipeline with the same stages as before, this time with a much smaller chunk size and explicit split patterns:
```python
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

document_character_text_splitter = DocumentCharacterTextSplitter() \
    .setInputCols("document") \
    .setOutputCol("splits") \
    .setChunkSize(20) \
    .setChunkOverlap(5) \
    .setExplodeSplits(True) \
    .setPatternsAreRegex(False) \
    .setKeepSeparators(True) \
    .setSplitPatterns(["\n\n", "\n", " ", ""]) \
    .setTrimWhitespace(True)

pipeline = Pipeline().setStages([documentAssembler, document_character_text_splitter])
pipeline_df = pipeline.fit(df).transform(df)

results = pipeline_df.select("splits").collect()

splits = [
    row["splits"][0].result.replace("\n\n", " ").replace("\n", " ")
    for row in results
]
```
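Each split annotation also carries its character offsets into the original document, so you can see exactly where every chunk came from. A small sketch, reusing `pipeline_df` from above:

```python
# Print every chunk with its begin/end offsets in the source text.
for row in pipeline_df.select("splits").collect():
    annotation = row["splits"][0]  # one annotation per row, since splits are exploded
    print(f"[{annotation.begin:>3}-{annotation.end:>3}] {annotation.result!r}")
```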
**Evaluation**
```python
expected = [
    "All emotions, and",
    "and that",
    "one particularly,",
    "were abhorrent to",
    "to his cold,",
    "precise but",
    "admirably balanced",
    "mind.",
    "He was, I take it,",
    "it, the most",
    "most perfect",
    "reasoning and",
    "and observing",
    "machine that the",
    "the world has seen.",
]

splits == expected
```

```
True
```
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"id": "Wq4G03A2qB5U" | ||
}, | ||
"source": [ | ||
"Great it works!" | ||
] | ||
} | ||