{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "97EiXueJA9cY"
},
"source": [
"![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zmxL_blSA9ce"
},
"source": [
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/DocumentCharacterTextSplitter.ipynb)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uI7yhCibA9cf"
},
"source": [
"## Colab + Data Setup"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4WQLLrIUA9cg",
"outputId": "8db13c0f-d5c0-4af7-c08d-78d1ea820653"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Installing PySpark 3.2.3 and Spark NLP 5.2.2\n",
"setup Colab for PySpark 3.2.3 and Spark NLP 5.2.2\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.3/547.3 kB\u001b[0m \u001b[31m30.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
]
}
],
"source": [
"!wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "nVTDX8SdiSD9"
},
"outputs": [],
"source": [
"!wget https://github.com/JohnSnowLabs/spark-nlp/blob/587f79020de7bc09c2b2fceb37ec258bad57e425/src/test/resources/spell/sherlockholmes.txt > /dev/null 2>&1"
]
},
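{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check that the file was downloaded:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!ls -l sherlockholmes.txt"
]
},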
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "KzMHa0HdA9ch",
"outputId": "f8f8b92a-12d5-45d9-a021-acd48241c4df"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Spark NLP version 5.2.2\n",
"Apache Spark version: 3.2.3\n"
]
}
],
"source": [
"import sparknlp\n",
"from sparknlp.base import *\n",
"from sparknlp.annotator import *\n",
"from pyspark.ml import Pipeline\n",
"\n",
"spark = sparknlp.start()\n",
"\n",
"print(f\"Spark NLP version {sparknlp.version()}\\nApache Spark version: {spark.version}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"id": "6qAa9p6ohtfi"
},
"outputs": [],
"source": [
"textDF = spark.read.text(\n",
" \"sherlockholmes.txt\",\n",
" wholetext=True\n",
").toDF(\"text\")"
]
},
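{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `wholetext=True` the entire file is loaded as a single row, so the splitter will see one long document. An optional check of its size:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"textDF.selectExpr(\"length(text) as char_count\").show()"
]
},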
{
"cell_type": "markdown",
"metadata": {
"id": "_S-XJDfUA9ci"
},
"source": [
"# Download DocumentTokenSplitter Model and Create Spark NLP Pipeline"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DVHludGFMSCk",
"outputId": "35716c76-f75f-4b53-b9c0-0235ec3d2db4"
},
"outputs": [
{
"data": {
"text/plain": [
"sparknlp.annotator.document_character_text_splitter.DocumentCharacterTextSplitter"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"DocumentCharacterTextSplitter"
]
},
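{
"cell_type": "markdown",
"metadata": {},
"source": [
"`DocumentCharacterTextSplitter` splits `DOCUMENT` annotations into chunks of at most `chunkSize` characters, trying each of the configured split patterns in order and overlapping consecutive chunks by `chunkOverlap` characters. Since it is a regular Spark ML transformer, `explainParams()` lists everything it can be configured with:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List the splitter's configurable parameters and their documentation\n",
"print(DocumentCharacterTextSplitter().explainParams())"
]
},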
{
"cell_type": "markdown",
"metadata": {
"id": "O4uPbdrSA9ci"
},
"source": [
"Lets create a Spark NLP pipeline with the following stages:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ASQ5Ot2NA9ci",
"outputId": "eebfcf2d-8674-4cf7-8ece-b41de8f6bb0a"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------------------------------------------------------------------+---------------+-------------+------+\n",
"| result|splits[0].begin|splits[0].end|length|\n",
"+--------------------------------------------------------------------------------+---------------+-------------+------+\n",
"|[{\"payload\":{\"allShortcutsEnabled\":false,\"fileTree\":{\"src/test/resources/spel...| 0| 19998| 19998|\n",
"|[Doctor, and give us your best\",\"attention.\\\"\",\"\",\"A slow and heavy step, whi...| 19806| 39805| 19999|\n",
"|[he said as he turned hungrily on the simple fare that\",\"our landlady had pro...| 39606| 59604| 19998|\n",
"|[armchair and\",\"putting his fingertips together, as was his custom when in\",\"...| 59407| 79406| 19999|\n",
"|[after a time, he did not come in at\",\"all. Still, of course, I never dared t...| 79208| 99201| 19993|\n",
"|[least an hour before us,\\\" he remarked, \\\"for they can\",\"hardly take any ste...| 99003| 119000| 19997|\n",
"|[after father's death, and\",\"a man who was nearly fifteen years younger than ...| 118801| 138782| 19981|\n",
"|[some of the details are of interest. The only drawback\",\"is that there is no...| 138599| 158591| 19992|\n",
"+--------------------------------------------------------------------------------+---------------+-------------+------+\n",
"only showing top 8 rows\n",
"\n"
]
}
],
"source": [
"documentAssembler = DocumentAssembler() \\\n",
" .setInputCol(\"text\") \\\n",
" .setOutputCol(\"document\")\n",
"\n",
"textSplitter = DocumentCharacterTextSplitter() \\\n",
" .setInputCols([\"document\"]) \\\n",
" .setOutputCol(\"splits\") \\\n",
" .setChunkSize(20000) \\\n",
" .setChunkOverlap(200) \\\n",
" .setExplodeSplits(True)\n",
"\n",
"pipeline = Pipeline().setStages([documentAssembler, textSplitter])\n",
"result = pipeline.fit(textDF).transform(textDF)\n",
"\n",
"result.selectExpr(\n",
" \"splits.result\",\n",
" \"splits[0].begin\",\n",
" \"splits[0].end\",\n",
" \"splits[0].end - splits[0].begin as length\").show(8, truncate = 80)"
]
},
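{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each split is a regular Spark NLP annotation, so besides `result`, `begin`, and `end` it also carries a `metadata` map. An optional peek at it:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result.selectExpr(\"splits[0].metadata\").show(3, truncate=False)"
]
},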
{
"cell_type": "markdown",
"metadata": {
"id": "CALoU6tSofto"
},
"source": [
"# Now let's make another pipeline to see if this actually works!"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "H5DFx2DOosri"
},
"source": [
"let's get the data ready"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "ZqR7pcQ9pw7a"
},
"outputs": [],
"source": [
"df = spark.createDataFrame([\n",
" [(\"All emotions, and that\\none particularly, were abhorrent to his cold, \"\n",
" \"precise but\\nadmirably balanced mind.\\n\\nHe was, I take it, the most \"\n",
" \"perfect\\nreasoning and observing machine that the world has seen.\")]\n",
"]).toDF(\"text\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ArsOgKafoft0"
},
"source": [
"Lets create a Spark NLP pipeline following the same stages as before:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "ph0Wm4qqLgVC"
},
"outputs": [],
"source": [
"documentAssembler = DocumentAssembler() \\\n",
" .setInputCol(\"text\") \\\n",
" .setOutputCol(\"document\")\n",
"\n",
"document_character_text_splitter = DocumentCharacterTextSplitter() \\\n",
" .setInputCols(\"document\") \\\n",
" .setOutputCol(\"splits\") \\\n",
" .setChunkSize(20) \\\n",
" .setChunkOverlap(5) \\\n",
" .setExplodeSplits(True) \\\n",
" .setPatternsAreRegex(False) \\\n",
" .setKeepSeparators(True) \\\n",
" .setSplitPatterns([\"\\n\\n\", \"\\n\", \" \", \"\"]) \\\n",
" .setTrimWhitespace(True)\n",
"\n",
"pipeline = Pipeline().setStages([documentAssembler, document_character_text_splitter])\n",
"pipeline_df = pipeline.fit(df).transform(df)\n",
"\n",
"results = pipeline_df.select(\"splits\").collect()\n",
"\n",
"splits = [\n",
" row[\"splits\"][0].result.replace(\"\\n\\n\", \" \").replace(\"\\n\", \" \")\n",
" for row in results\n",
"]"
]
},
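{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's print the collected chunks before comparing them against the expected output:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for chunk in splits:\n",
"    print(chunk)"
]
},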
{
"cell_type": "markdown",
"metadata": {
"id": "mjUiY6sOp-jY"
},
"source": [
"**Evaluation**"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "S_P2TnL-MCqM",
"outputId": "35dcc363-2975-4ab3-cae0-47e10fba8f5f"
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"expected = [\n",
" \"All emotions, and\",\n",
" \"and that\",\n",
" \"one particularly,\",\n",
" \"were abhorrent to\",\n",
" \"to his cold,\",\n",
" \"precise but\",\n",
" \"admirably balanced\",\n",
" \"mind.\",\n",
" \"He was, I take it,\",\n",
" \"it, the most\",\n",
" \"most perfect\",\n",
" \"reasoning and\",\n",
" \"and observing\",\n",
" \"machine that the\",\n",
" \"the world has seen.\",\n",
"]\n",
"\n",
"splits == expected"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Wq4G03A2qB5U"
},
"source": [
"Great it works!"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [
"uI7yhCibA9cf"
],
"provenance": []
},
"kernelspec": {
"display_name": "Python [conda env:tempspark]",
"language": "python",
"name": "conda-env-tempspark-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 0
}